In this Rmarkdown report, I present the code and the analyses for the population structure and diversity statistics of a world-wide whole genome sampling of Zymoseptoria tritici. The figures are the ones used in the manuscript and the code here complements the other scripts available with the manuscripts.
library(knitr)
library(reticulate)
#Spatial analyses packages
library(raster)
library(sp)
library(rgdal)
library(maps)
library(geosphere)
library(geodist)
library(ggExtra)
#Data wrangling and data viz
library(purrr)
library(RColorBrewer)
library(plotly)
library(cowplot)
library(GGally)
library(corrplot)
library(pheatmap)
library(ggstance)
library('pophelper')
library(ggbiplot)
library(igraph)
library(ggraph)
library(ggtext)
library(scatterpie)
library(tidyverse)
library(ggExtra)
library(ggpubr)
#Pop structure and pop genomic
library(GenomicFeatures)
library(SNPRelate) #PCA
library(LEA) #Clustering
library(pophelper)
#library(PopGenome) #Summary statistics
library(gridExtra)
library(ggExtra)
library(multcomp)
library(lsmeans)
#GEA
library(lfmm)
#Statistics
library(car)
library(corrr)
library(lsmeans)
library(multcomp)
library(caret)
library(mgcv)
#Variables
world <- map_data("world")
project_dir="~/Documents/Postdoc_Bruce/Projects/WW_project/"
lists_dir = "~/Documents/Postdoc_Bruce/Projects/WW_project/WW_PopGen/Keep_lists_samples/"
#Data directories
data_dir=paste0(project_dir, "0_Data/")
metadata_dir=paste0(project_dir, "Metadata/")
fig_dir = "~/Documents/Postdoc_Bruce/Manuscripts/Feurtey_WW_Zt/Draft_figures/"
#Analysis directories
#-___________________
VAR_dir = paste0(project_dir, "1_Variant_calling/")
depth_per_window_dir = paste0(VAR_dir, "1_Depth_per_window/")
vcf_dir = paste0(VAR_dir, "4_Joint_calling/")
mito_SV = paste0(VAR_dir, "6_Mito_SV/")
PopStr_dir = paste0(project_dir, "2_Population_structure/")
nuc_PS_dir=paste0(PopStr_dir, "0_Nuclear_genome/")
mito_PS_dir = paste0(PopStr_dir, "1_Mitochondrial_genome/")
Sumstats_dir = paste0(project_dir, "3_Sumstats_demography/")
TE_RIP_dir=paste0(project_dir, "4_TE_RIP/")
RIP_DIR=paste0(TE_RIP_dir, "0_RIP_estimation/")
DIM2_DIR=paste0(TE_RIP_dir, "1_Blast_from_denovo_assemblies/")
GEA_dir=paste0(project_dir, "5_GEA/")
fung_dir=paste0(project_dir, "6_Fungicide_resistance/")
virulence_dir = paste0(project_dir, "7_Virulence/")
sel_dir = paste0(project_dir, "8_Selection/")
gene_list_dir = paste0(sel_dir, "0_Lists_unique_copy/")
#Files
vcf_name="Ztritici_global_March2021.genotyped.ALL.filtered.clean.AB_filtered.variants.good_samples.biall_SNP.max-m-1.maf-0.05.thin-1000bp"
vcf_name_nomaf="Ztritici_global_March2021.filtered-clean.AB_filtered.SNP.max-m-0.8.thin-1000bp"
vcf_name_mito = "Ztritici_global_March2021.genotyped.mt.filtered.clean.AB_filtered.variants.good_samples.max-m-80"
Zt_list = paste0(lists_dir, "Ztritici_global_March2021.genotyped.good_samples.args")
gff_file = paste0(data_dir, "Zymoseptoria_tritici.MG2.Grandaubert2015.no_CDS.gff3")
ref_fasta_file = paste0(data_dir, "Zymoseptoria_tritici.MG2.dna.toplevel.mt+.fa")
metadata_name = "Main_table_from_SQL_Feb_2020"
gene_annotation = read_tsv(paste0(data_dir, "Badet_GLOBAL_PANGENOME_TABLE.txt"))
complete_mito = read_tsv(paste0(data_dir, "Complete_mitochondria_from_blast.txt"), col_names = c("ID_file", "Contig"))
Sys.setenv(PROJECTDIR=project_dir)
Sys.setenv(VARDIR=VAR_dir)
Sys.setenv(VCFDIR=vcf_dir)
Sys.setenv(POPSTR=PopStr_dir)
Sys.setenv(MITOPOPSTR=mito_PS_dir)
Sys.setenv(SUMST=Sumstats_dir)
Sys.setenv(GEADIR=GEA_dir)
Sys.setenv(ZTLIST=Zt_list)
Sys.setenv(GFFFILE = gff_file)
Sys.setenv(REFFILE = ref_fasta_file)
Sys.setenv(VCFNAME=vcf_name)
Sys.setenv(VCFNAME_NOMAF=vcf_name_nomaf)
Sys.setenv(VCFNAME_MITO=vcf_name_mito)
#knitr::opts_chunk$set(echo = F)
knitr::opts_chunk$set(message = F)
knitr::opts_chunk$set(warning = F)
knitr::opts_chunk$set(results = T)
knitr::opts_chunk$set(dev=c('png', 'pdf'))
# Metadata and sample lists
##Filtered_samples
filtered_samples = bind_rows(
read_tsv(paste0(metadata_dir, "Sample_removed_based_on_IBS.args"), col_names = "ID_file") %>%
mutate(Filter = "IBS"),
read_tsv(paste0(metadata_dir, "Sample_with_too_much_NA.args"), col_names = "ID_file") %>%
mutate(Filter = "High_NA"),
read_tsv(paste0(metadata_dir, "Samples_to_filter_out.args"), col_names = "ID_file") %>%
mutate(Filter = "Mutants_etc"))
##Samples in vcf
genotyped_samples = read_tsv(Zt_list, col_names = "ID_file")
## Metadata of genotyped samples
temp = read_tsv(paste0(metadata_dir, metadata_name, "_Description.tab"), col_names = F) %>% pull()
Zt_meta = read_delim(paste0(metadata_dir, metadata_name, "_with_collection.tab"),
col_names = temp,
na = "\\N", guess_max = 2000) %>%
unite(Coordinates, Latitude, Longitude, sep = ";", remove = F) %>%
inner_join(., genotyped_samples) %>%
mutate(Country = ifelse(Country == "USA", paste(Country, Region, sep = "_"), Country)) %>%
mutate(Country = ifelse(Country == "Australia", paste(Country, Region, sep = "_"), Country)) %>%
mutate(Country = ifelse(Country == "NZ", "New Zealand", Country)) %>%
mutate(Country = ifelse(Country == "CH", "Switzerland", Country)) %>%
mutate(Latitude2 = round(Latitude, 2), Longitude2 = round(Longitude, 2)) %>%
dplyr::select(ID_file, Sampling_Date, Collection,
Country, Continent, Latitude, Longitude, Latitude2, Longitude2)
#genotyped_samples %>%
# filter(!(ID_file %in% filtered_samples$ID_file)) %>%
# write_tsv(Zt_list, col_names = F)
#Define colors
## For continents
#myColors <- c("#04078B", "#a10208", "#FFBA08", "#CC0044", "#5C9D06", "#129EBA","#305D1B")
myColors <- c("#DA4167", "grey", "#ffba0a", "#A20106", "#3F6E0C", "#129eba", "#8fa253" )
names(myColors) = levels(factor(Zt_meta$Continent))
Color_Continent = ggplot2::scale_colour_manual(name = "Continent", values = myColors)
Fill_Continent = ggplot2::scale_fill_manual(name = "Continent", values = myColors)
## For clustering
K_colors = c("#f9c74f", "#f9844a", "#90be6d", "#f5cac3",
"#83c5be", "#f28482", "#577590", "#e5e5e5", "#a09abc", "#52796f",
"#219ebc", "#003049", "#82c0cc", "#283618", "white")
## For correlations
mycolorsCorrel<- colorRampPalette(c("#0f8b8d", "white", "#a8201a"))(20)
Zt_meta
#Random gradients
greens=c("#1B512D", "#669046", "#8CB053", "#B1CF5F", "#514F59")
blues=c("#08386E", "#1C89C9", "#28A7C0", "#B0DFE8", "grey")
The sampling of Z.tritici isolated from the natural environment covers almost the entirety of the wheat-grown continents. It is, however, highly heterogeneous. Europe has the highest sampling density. Several locations are heavily sampled, such a fields in Switzerland or the US.
#kable(Zt_meta %>% dplyr::count(Collection, name = "Number of genomes"))
Zt_meta %>% dplyr::count(Collection, name = "Number of genomes")
## # A tibble: 18 × 2
## Collection `Number of genomes`
## <chr> <int>
## 1 BIOGER_Thierry 17
## 2 Eschikon_2017 94
## 3 ETHZ_2020 128
## 4 Fillinger 2
## 5 GWASpanel_BIOGER 90
## 6 Hartmann_FstQst_2015 132
## 7 Hartmann_Oregon_2016 94
## 8 JGI 43
## 9 JGI_1 1
## 10 JGI_2 32
## 11 JGI_3 20
## 12 JGI_4 3
## 13 JGI_Thierry 43
## 14 Megan_McDonald 13
## 15 MMcDonald_2018 92
## 16 Syngenta 283
## 17 Third_batch_2018_BM_TM 16
## 18 <NA> 6
max_circle = max(Zt_meta %>%
dplyr::count(Latitude, Longitude, name = "Number_genomes") %>%
dplyr::select(Number_genomes))
temp = Zt_meta %>%
dplyr::count(Country, Latitude, Longitude, name = "Number_genomes") %>%
filter(Number_genomes > 0)
empty_map = ggplot() + theme_void() +
geom_polygon(data = world, aes(x=long, y = lat, group = group), fill="#ede7e3", alpha=0.7)
p1 = empty_map +
geom_point(data = temp,
aes(x=as.numeric(Longitude), y=as.numeric(Latitude),size=Number_genomes, text = Country),
alpha = 0.6, color = "#ff9f1c") +
scale_size("Number of genomes", limits = c(1, max_circle)) +
coord_cartesian(xlim=c(-20, 60), ylim=c(20, 70)) +
theme(legend.position = "none")
p2 = empty_map +
geom_point(data = temp,
aes(x=as.numeric(Longitude), y=as.numeric(Latitude),size=Number_genomes, text = Country),
alpha = 0.6, color = "#ff9f1c") +
scale_size("Number of genomes", limits = c(1, max_circle)) + coord_cartesian(xlim=c(-160, -40), ylim=c(-80, 80)) +
theme(legend.position = "none")
p3 = empty_map +
geom_point(data = temp,
aes(x=as.numeric(Longitude), y=as.numeric(Latitude),size=Number_genomes, text = Country),
alpha = 0.6, color = "#ff9f1c") +
scale_size("Number of genomes", limits = c(1, max_circle)) + coord_cartesian(xlim=c(115, 175), ylim=c(-60, 10))
#Plotting all the maps together!
aus_map = cowplot::plot_grid(p3+ theme(legend.position = "none"), get_legend(p3), ncol = 2, rel_widths = c(1, 0.9))
ligne = cowplot::plot_grid(p1, aus_map, ncol = 1, rel_heights = c(1, 0.7))
cowplot::plot_grid(p2, ligne, ncol = 2, rel_widths = c(0.7, 1))
In the analyses, I will often use countries as a geographical unit, with the exception of the United States, which will instead be divided in states.
countries = dplyr::count(Zt_meta, Country, Continent)
countries %>% filter(n >= 8) %>%
ggplot(aes(x = Country, y =n, fill = Continent)) +
geom_bar(stat = "identity") +
Fill_Continent +
theme_cowplot() +
labs(x = element_blank(), y = "Number of samples") +
theme(axis.text.x = element_text(angle = 40, hjust = 1, vjust = 1))
for (country in filter(countries, n >= 8) %>% pull(Country)) {
country_name = str_replace(country, pattern = " ", replacement = "_")
file_name = paste0(PopStr_dir, "Sample_list_", country_name, ".args")
Zt_meta %>% filter(Country == country) %>%
dplyr::select(ID_file) %>%
write_tsv(file = file_name, col_names = F)
}
In some cases, countries are too high level, so I subdivide them into geoclusters, centered on rounded coordinates. These geoclusters have a lower limit in terms of sample number. TODO: remove the limit that I never use. There is little need for both.
for ( min_number in c(6, 10) ) {
small_pops = Zt_meta %>%
filter(!is.na(Country) & !is.na(Sampling_Date)) %>%
dplyr::count(Country, Latitude2, Longitude2, Sampling_Date, name = "Number_genomes") %>%
filter( Number_genomes >= min_number )
map1 = ggplot() +
theme_void() +
geom_polygon(data = map_data("world"), aes(x=long, y = lat, group = group), fill="#ede7e3") +
scale_size("Number of genomes", limits = c(1, max_circle)) +
theme(legend.position = "None",
panel.border = element_rect(fill=NA, colour = "grey",size = 0.5, linetype = 1))+
scale_fill_manual(values = c("grey", K_colors) )
p1 = map1 + coord_cartesian(xlim=c(-20, 60), ylim=c(20, 70)) +
geom_point(data = small_pops,
mapping = aes(x=Longitude2, y=Latitude2, size=Number_genomes),
alpha = 0.6, color = "#2ec4b6")
p2 = map1 + coord_cartesian(xlim=c(-160, -40), ylim=c(-80, 80)) +
geom_point(data = small_pops,
mapping = aes(x=Longitude2, y=Latitude2, size=Number_genomes),
alpha = 0.6, color = "#2ec4b6")
p3 = map1 + coord_cartesian(xlim=c(115, 175), ylim=c(-65, 10)) +
geom_point(data = small_pops,
mapping = aes(x=Longitude2, y=Latitude2, size=Number_genomes),
alpha = 0.6, color = "#2ec4b6")
aus_map = cowplot::plot_grid(p3, get_legend(p3), ncol = 2, rel_widths = c(1, 0.9))
ligne = cowplot::plot_grid(p1, aus_map, ncol = 1, rel_heights = c(1, 0.7))
cowplot::plot_grid(p2, ligne, ncol = 2, rel_widths = c(0.7, 1))
for (t in 1:nrow(small_pops)) {
country = pull(small_pops[t, "Country"])
year = pull(small_pops[t, "Sampling_Date"])
lat = pull(small_pops[t, "Latitude2"])
long = pull(small_pops[t, "Longitude2"])
country_name = str_replace(country, pattern = " ", replacement = "_")
for (i in 1:10 ){
file_name = paste0(PopStr_dir, "Sample_list_subset_",
paste(min_number, country_name, year, lat, long, i, sep = "_"),
".args")
temp = Zt_meta %>%
filter(Country == country) %>%
filter(Sampling_Date == year) %>%
filter(Latitude2 == lat) %>%
filter(Longitude2 == long) %>%
dplyr::select(ID_file) %>%
pull()
write_tsv(as.tibble(sample(temp, size = min_number)), file = file_name, col_names = F)
}
}
}
The sampling also covers a wide range of years: starting from 1990 to 2017. Just as with the geographical repartition, some years are heavily sampled, reflecting sampling in specific fields done for previous experiments.
temp = as_tibble(c(min(Zt_meta$Sampling_Date, na.rm = T) : max(Zt_meta$Sampling_Date, na.rm = T))) %>%
mutate(`Sampling year` = as.character(value))
sum_temp = Zt_meta %>%
mutate(`Sampling year` = as.character(Sampling_Date)) %>%
dplyr::count(`Sampling year`, Continent) %>%
full_join(., temp) %>%
mutate(`Genome number` = replace_na(n, 0))
sum_temp %>%
ggplot(aes(x=`Sampling year`, y =`Genome number`, fill = Continent)) +
geom_bar(stat = "identity") +
theme_bw() + theme(axis.title = element_blank(),
axis.text.x = element_text(angle = 60, hjust = 1)) +
Fill_Continent
I have blasted the known sequences of the two mating type genes against each of the assemblies and will now gather all of this data in a summary table.
mat1_c=0
mat2_c=0
while read sample ;
do
if [ -f /data2/alice/WW_project/2_Population_structure/Mat1_1_${sample}.blast_out.tsv ] ;
then
mat1="OK" ;
mat1_c=$(($mat1_c + 1))
else
mat1="NO" ;
fi ;
if [ -f "/data2/alice/WW_project/2_Population_structure/Mat1_2_${sample}.blast_out.tsv" ] ;
then
mat2="OK";
mat2_c=$(($mat2_c + 1))
else
mat2="NO" ;
fi ;
echo $sample $mat1 $mat2 ;
done < Keep_lists_samples/Ztritici_global_March2021.genotyped.good_samples.args
echo $mat1_c $mat2_c
mat_genes_blast = read_delim(paste0(PopStr_dir, "Mat1_blast_all.tsv"), delim = " ",
col_names = c("ID_file", "gene", "qseqid", "sseqid", "pident", "length",
"mismatch", "gapopen", "qstart", "qend",
"sstart", "send", "evalue", "bitscore")) %>%
distinct() %>%
dplyr::inner_join(Zt_meta) %>%
group_by(ID_file, gene, Continent, Country, Latitude2, Longitude2) %>%
dplyr::mutate(Cum_length = sum(length))%>% ungroup()
mat_genes = ungroup(mat_genes_blast) %>%
dplyr::group_by(ID_file, gene, Continent, Country, Latitude2, Longitude2) %>%
dplyr::summarize(Cum_length = sum(length)) %>% ungroup()
dplyr::count(mat_genes, gene, Cum_length)
## # A tibble: 23 × 3
## gene Cum_length n
## <chr> <dbl> <int>
## 1 Mat_1_1 182 2
## 2 Mat_1_1 801 1
## 3 Mat_1_1 859 1
## 4 Mat_1_1 886 1
## 5 Mat_1_1 893 2
## 6 Mat_1_1 895 1
## 7 Mat_1_1 898 517
## 8 Mat_1_1 899 5
## 9 Mat_1_1 900 1
## 10 Mat_1_1 904 1
## # … with 13 more rows
mat_genes %>%
ggplot(aes(Cum_length)) +
geom_density() +
theme_bw() +
facet_wrap(vars(gene)) +
labs(x = "Cumulative length of blast matches", y = "Density",
title = "Sanity check for the blast search using the match length")
mat11_size = median(filter(mat_genes, gene == "Mat_1_1")$Cum_length)
mat12_size = median(filter(mat_genes, gene == "Mat_1_2")$Cum_length)
mat_genes_blast = mat_genes_blast %>%
mutate(Filtering = case_when(gene == "Mat_1_1" & Cum_length >= mat11_size - 20 & Cum_length <= mat11_size + 20 ~ "Keep",
gene == "Mat_1_2" & Cum_length >= mat12_size - 20 & Cum_length <= mat12_size + 20 ~ "Keep",
TRUE ~ "Filter_out"))
# Count and plot per country
temp = mat_genes_blast %>%
dplyr::select(ID_file, gene, Continent, Country) %>%
distinct() %>%
dplyr::count(Continent, Country, gene) %>%
filter(!is.na(Continent))
dplyr::filter(temp, !is.na(Country)) %>%
dplyr::group_by(Continent, Country) %>%
dplyr::mutate(Total = sum(n)) %>%
filter(Total > 6) %>%
ggplot(aes(x = Country, y = n, fill = gene))+
geom_bar(stat = "identity", position = "fill") +
geom_hline(yintercept = .5) +
theme_bw() +
labs(y = "Number of isolates",
title = "Mat genes in countries including more than 6 samples",
fill = "Mat type", x = "") +
scale_fill_manual(values = c("#CBF3F0", "#2EC4B6")) +
coord_flip()
Question: How prelavent is aneuploidy in natural populations of Z.tritici? In the case of accessory chromosomes, is there a correlation between phylogeny, environment, host or time and the presence/absence of some chromosomes?
Methods: Based on the depth of coverage for all samples, we can identify for both core and accessory chromosomes whether each isolates includes 1, 0 or several copies.
core_depth = read_tsv(paste0(depth_per_window_dir, "Depth_per_sample_core_chr_summary.q30.txt")) %>%
mutate(Median_core = Median)
chrom_depth = read_tsv(paste0(depth_per_window_dir, "Depth_per_chromosome_summary.q30.txt")) %>%
left_join(., core_depth %>%
dplyr::select(Sample, Median_core)) %>%
mutate(Relative_depth = Median/Median_core) %>%
left_join(.,Zt_meta, by = c("Sample" = "ID_file")) %>%
filter(CHROM != "mt") %>%
mutate(Sample = fct_reorder(Sample, Sampling_Date)) %>%
mutate(Sample = fct_reorder(Sample, Country)) %>%
mutate(Sample = fct_reorder(Sample, Continent))
heatmap_depth = chrom_depth %>%
filter(CHROM != "mt") %>%
ggplot(aes(x = as.numeric(CHROM), y=Sample, fill=Relative_depth)) +
geom_tile() + scale_fill_gradient2(low="white", high = "#2ec4b6") +
geom_vline(xintercept = 13.5, linetype = "longdash", colour = "gray20") +
theme(axis.text.y = element_blank(),
axis.title.y = element_blank(), axis.ticks.y=element_blank()) +
labs(fill = "Depth", x= "Chromosome") + xlim(c(0.5, 21.5))
plot_continent = chrom_depth %>%
ggplot(aes(x = 1, y=Sample, fill=Continent)) +
geom_tile(aes(width = 2)) +
theme_classic() +
theme(axis.text.y = element_blank(), axis.ticks.y=element_blank(),
legend.position="left",
axis.text.x=element_text(colour="white")) +
labs(y= "Isolate") + Fill_Continent
plot_grid(plot_continent, heatmap_depth, rel_widths = c(2, 5))
In the heatmap, I represent the depth normalized by the median depth over all core chromosomes. As expected, the copy-number variation at the chromosome scale affects mostly the accessory chromosomes (AC). There is some presence of supernumerary AC and a lot of presence-absence variation. Supernumerary chromosomes can also be found in the core chromosomes but this is almost anecdotal as over the whole sampling this was found only in 9 cases.
Lthres = 0.50
Hthres = 1.50
depth = chrom_depth %>%
dplyr::filter(!is.na(Relative_depth)) %>%
dplyr::mutate(Depth_is = ifelse(Relative_depth > Hthres, "High", ifelse(Relative_depth < Lthres, "Low", "Normal"))) %>%
mutate(CHROM = fct_reorder(CHROM, as.numeric(CHROM)))
bar_Ndepth_per_CHR =ggplot(depth, aes(x = CHROM, fill = Depth_is)) +
geom_bar(stat = "count") +
scale_fill_manual(values =c( "#2ec4b6", "#cbf3f0","#EDE7E3")) +
theme_light()+
labs(x= "Chromosome", y = "Number of isolates")
#lollipop plots
##For high normalized depth values
temp = depth %>%
filter(Depth_is == "High") %>%
dplyr::group_by(CHROM) %>%
dplyr::count() %>%
mutate(CHROM = fct_reorder(CHROM, as.numeric(CHROM)))
lolhigh = ggplot(temp, aes(x = as.character(CHROM), y = n)) +
geom_segment( aes(x=as.character(CHROM), xend=as.character(CHROM), y=0, yend=max(temp$n)),
color="grey80", size = 1) +
geom_segment( aes(x=as.character(CHROM), xend=as.character(CHROM), y=0, yend=n),
color="grey20", size = 1) +
geom_point( color="#2ec4b6", size=4) +
geom_text(aes( label = n,
y= n), stat= "identity",
hjust = -0.5, vjust = -0.2) +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.x = element_blank(),
axis.text.x = element_text(size = 10),
axis.text.y = element_text(size = 10)
) +
ylim(c(0,max(temp$n)+2+ max(temp$n)*0.1)) +
labs(x= "Chromosome", y = "Number of isolates with supernumerary chromosome") +
coord_flip()
##For low normalized depth values
temp = depth %>%
filter(Depth_is == "Low") %>%
dplyr::group_by(CHROM) %>%
dplyr::count()%>%
mutate(CHROM = fct_reorder(CHROM, as.numeric(CHROM)))
lollow = ggplot(temp, aes(x = CHROM, y = n)) +
geom_segment( aes(x=CHROM, xend=CHROM, y=0, yend=max(temp$n)),
color="grey80", size = 1) +
geom_segment( aes(x=CHROM, xend=CHROM, y=0, yend=n),
color="grey20", size = 1) +
geom_point( color="#cbf3f0", size=4) +
geom_text(aes( label = n,
y= n), stat= "identity",
hjust = -0.5, vjust = -0.2) +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.x = element_blank(),
axis.text.x = element_text(size = 10),
axis.text.y = element_text(size = 10)
) +
ylim(c(0,max(temp$n)+ max(temp$n)*0.1)) +
labs( x= "Chromosome", y = "Number of isolates without chromosome") +
coord_flip()
bottom_row <- cowplot::plot_grid(lolhigh, lollow, labels = c('B', 'C'), label_size = 12)
plot_grid(bar_Ndepth_per_CHR, bottom_row, labels = c('A', ''), label_size = 12, ncol = 1)
Here is a table including the isolates with supernumerary core chromosomes.
depth %>%
dplyr::filter(Depth_is == "High") %>%
dplyr::filter(as.numeric(CHROM) < 13) %>%
dplyr::select(Sample, CHROM, Median, Median_core, Collection, Country)
## # A tibble: 18 × 6
## Sample CHROM Median Median_core Collection Country
## <fct> <fct> <dbl> <dbl> <chr> <chr>
## 1 08STCZ011 12 47 24 Syngenta Czech …
## 2 08STF035 5 154 79 Syngenta France
## 3 08STF036 12 159 82 Syngenta France
## 4 09STD041 4 157 86 Syngenta Germany
## 5 ST00ARG_BD0069 5 17 9 <NA> <NA>
## 6 ST13SP_Biog_SeptoDur104 4 100 49 BIOGER_Thierry Spain
## 7 ST16CH_2X10 1 2 1 <NA> <NA>
## 8 ST16CH_2X10 2 2 1 <NA> <NA>
## 9 ST16CH_2X10 3 2 1 <NA> <NA>
## 10 ST16DK_Biog_DK15 5 16 8 <NA> <NA>
## 11 ST90ORE_a12_3B_11 12 32 19 Hartmann_FstQst_2015 USA_Or…
## 12 STnnJGI_SRR4235099 12 28 17 JGI_2 USA_Or…
## 13 STnnJGI_SRR4235109 5 33 17 JGI_2 Switze…
## 14 STnnJGI_SRR5194526 12 78 40 JGI Sweden
## 15 STnnJGI_SRR5194528 5 84 42 JGI Sweden
## 16 STnnJGI_SRR5194605 5 73 37 JGI Hungary
## 17 STnnJGI_SRR5829692 4 165 84 JGI Kenya
## 18 STnnJGI_SRR5829692 5 163 84 JGI Kenya
And an overlook of the accessory chromosomes PAV per continent (only considering continents with more than 10 isolates).
depth %>%
dplyr::filter(as.numeric(CHROM) > 13) %>%
filter(!is.na(Continent)) %>%
dplyr::group_by(Continent, CHROM) %>%
dplyr::mutate(Count_sample_per_continent = n()) %>%
dplyr::filter(Count_sample_per_continent >= 10) %>%
ggplot(aes(x = Continent, fill = Depth_is)) +
geom_bar(position = "fill" ) + facet_wrap(CHROM~.) +
theme_light() +
scale_fill_manual(values = c("#2ec4b6", "#cbf3f0", "#EDE7E3")) +
theme(axis.text.x = element_text(angle = 40, hjust = 1)) +
ylab("Proportion of chromosomes") + xlab("")
Question: Is the world-wide population of Z.tritici structured? If so, is it structured according to geography, host or time (or any other relevant info we hopefully have)?
Previous genomic work has shown very clear structure between populations of Z.tritici. However,the sampling was extremely heterogeneous. With a more geographically even sampling, do we also observe a clear-cut structure?
Methods: The methods I chose to use to investigate the structure were the following:
The clustering here is done by using the snmf method from the LEA R package (http://membres-timc.imag.fr/Olivier.Francois/LEA/) on the same subset of SNPs as the PCA, but without any missing data. I ran the analysis for a K (number of cluster inferred) ranging from 1 to 15 and with 10 repeats for each K.
vcftools --vcf ${VCFDIR}$VCFNAME.recode.vcf \
--extract-FORMAT-info GT \
--out ${POPSTR}$VCFNAME
cat ${POPSTR}$VCFNAME.GT.FORMAT | cut -f 3- \
> ${POPSTR}$VCFNAME.GT.FORMAT2
cat ${POPSTR}$VCFNAME.GT.FORMAT | cut -f 1,2 \
> ${POPSTR}$VCFNAME.GT.FORMAT.pos
head -n1 ${POPSTR}$VCFNAME.GT.FORMAT2 | gsed "s/\t/\n/g" \
> ${POPSTR}$VCFNAME.ind
gsed "s/\t//g" ${POPSTR}$VCFNAME.GT.FORMAT2 | tail -n +2 \
> ${POPSTR}$VCFNAME.geno
datamash transpose < ~/Documents/Postdoc_Bruce/Projects/WW_project/2_Population_structure/Ztritici_global_March2021.genotyped.ALL.filtered.clean.AB_filtered.variants.good_samples.biall_SNP.max-m-1.maf-0.05.thin-1000bp.even_sampling.GT.FORMAT2 | sed 's/^/>/' | sed 's/\t/\n/' | sed 's/\t//g' | cut -c -150 > ~/Documents/Postdoc_Bruce/Projects/WW_project/2_Population_structure/Ztritici_global_March2021.genotyped.ALL.filtered.clean.AB_filtered.variants.good_samples.biall_SNP.max-m-1.maf-0.05.thin-1000bp.even_sampling.fasta
#project = snmf(paste0(PopStr_dir, vcf_name, ".geno"), K=1:15, entropy = TRUE,
# repetitions = 10, project = "new", ploidy = 1)
# Reading the results from the snmf runs
# ______________________________________
#Sample names
indv_snmf = read_tsv(paste0(PopStr_dir, vcf_name, ".ind"), col_names = F)
names(indv_snmf) = "Sample"
#Load project
project = load.snmfProject(paste0(PopStr_dir, vcf_name, ".snmfProject"))
K_list = c(1:15)
#Extracting the clustering info for the best run per K
datalist = list()
for (i in K_list){
best = which.min(cross.entropy(project, K = i))
temp = as.data.frame(Q(project, i, best))
temp= cbind(indv_snmf, temp)
temp = temp %>%
gather("Cluster", "Admix_coef", -"Sample") %>%
mutate(K=i)
datalist[[i]] = as.tibble(temp)
}
snmf_results_per_K = bind_rows(datalist) %>%
inner_join(., Zt_meta, by = c("Sample" = "ID_file")) %>%
unite(Continent, Country, col = "for_display", remove = F)
#sNMF pretty plots
# _______________
afiles = character(length(K_list))
for (i in K_list){
best = which.min(cross.entropy(project, K = i))
afiles[i] = Sys.glob(paste0(PopStr_dir, vcf_name, ".snmf/K",i, "/run", best, "/*Q"))
}
# create a qlist
qlist <- readQBasic(afiles)
al_qlist = alignK(qlist)
lab_set = inner_join(indv_snmf, Zt_meta, by = c("Sample" = "ID_file")) %>%
dplyr::select(Continent, Country) %>%
mutate(Continent = ifelse(is.na(Continent), "Unknown", Continent),
Country = ifelse(is.na(Country), "Unknown", Country))
#Low numbers
from = 2
up_to = 6
p1 <- plotQ(alignK(qlist[from:up_to], type = "across"),
imgoutput="join",
returnplot=T,exportplot=F,
basesize=11,
splab= paste0("K=", K_list[from:up_to]),
grplab=lab_set, ordergrp=T, grplabangle = 40, grplabheight = 2, grplabsize = 2,
clustercol = K_colors)
grid.arrange(p1$plot[[1]])
#Medium numbers
from = 7
up_to = 11
p2 <- plotQ(alignK(qlist[from:up_to], type = "across"),
imgoutput="join",
returnplot=T,exportplot=F,
basesize=11,
splab= paste0("K=", K_list[from:up_to]),
grplab=lab_set, ordergrp=T, grplabangle = 40, grplabheight = 2, grplabsize = 2,
clustercol = K_colors)
grid.arrange(p2$plot[[1]])
#High numbers
from = 12
up_to = 15
p3 <- plotQ(alignK(qlist[from:up_to], type = "across"),
imgoutput="join",
returnplot=T,exportplot=F,
basesize=11,
splab= paste0("K=", K_list[from:up_to]),
grplab=lab_set, ordergrp=T, grplabangle = 40, grplabheight = 2, grplabsize = 2,
clustercol = K_colors)
grid.arrange(p3$plot[[1]])
I need to identify the isolates which belong to each of the clusters. For this, we need to set a threshold, since there are very few (or even no) isolates assigned fully to one only. I test two thresholds: isolates assigned to one cluster with a value higher than 0.75 and 0.9.
#Setting thresholds to compare
chosen_threshold = 0.75
chosen_threshold2 = 0.9
pure_by_threshold = bind_rows(snmf_results_per_K %>%
filter(Admix_coef > chosen_threshold) %>%
mutate(Threshold = paste0("> ", chosen_threshold)),
snmf_results_per_K %>%
filter(Admix_coef > chosen_threshold2) %>%
mutate(Threshold = paste0("> ", chosen_threshold2)))
# Number of pure isolates per K
pure_by_threshold %>%
dplyr::group_by(K, Threshold) %>%
dplyr::count() %>%
ggplot(aes(x = as.factor(K), y = n, fill = Threshold)) +
geom_bar(stat = "identity", position = "dodge") +
theme_bw() + scale_fill_manual(values = c("#2ec4b6", "#cbf3f0"))+
labs(x = "K", y = "Number of isolates", title = "Pure isolates per K")
# Number of pure isolates per K per country
temp2 = pure_by_threshold %>%
dplyr::group_by(Continent, for_display, Cluster, K, Threshold) %>%
dplyr::count()
temp2 %>%
filter(K > 9 & K < 16) %>%
ggplot(aes(x = Cluster, y = for_display,
size = n, color = Continent)) +
geom_point(alpha = 0.3) + Color_Continent+ theme_bw() +
theme(axis.text.x = element_text(angle = 60, hjust = 1)) +
facet_grid(rows = vars(Threshold), cols = vars(K), scales = "free_x") +
labs(x = "", y = "", size = "Nb of isolates",
title = "Pure isolates per country and per K")
Choosing the number K of clusters that best represent the data at hand is always a challenge. First, let’s look at the cross-validation results. sNMF estimates an entropy criterion which evaluates the quality of fit of the model to the data, potentially helping to find the best number of ancestral populations.
#Dot plot of the cross-entropy
#plot(project, col = "goldenrod", pch = 19, cex = 1.2) #native method
cross_ent = list()
for (i in K_list){
cross_ent[[i]] = as_tibble(cross.entropy(project, K = i), rownames = "NA") %>%
mutate( K = i)
colnames(cross_ent[[i]] ) <- c("Run", "Crossentropy", "K")
}
cross_ent = bind_rows(cross_ent)
temp = cross_ent %>% group_by(K) %>% dplyr::summarize(average = mean(Crossentropy), minimum = min(Crossentropy))
p1 = ggplot() +
geom_point(data = cross_ent, aes(x = as.factor(K), y = Crossentropy), alpha = 0.4, size = 2, col = "#cbf3f0") +
geom_point(data = temp, aes(x = as.factor(K), y = minimum), alpha = 0.4, size = 2, col = "#2ec4b6") +
theme_cowplot() +
labs(x = "K", y = "Cross-entropy")
p2 = pure_by_threshold %>%
filter(Threshold == paste0("> ", chosen_threshold)) %>%
filter(K > 1) %>%
dplyr::group_by(Cluster, K) %>%
dplyr::count()%>%
group_by(K) %>%
dplyr::summarize(Size_smallest_cluster = min(n)) %>%
ggplot(aes(x = as.factor(K), label = Size_smallest_cluster)) +
geom_bar(aes(y = Size_smallest_cluster, fill = K), stat = "identity") +
theme_cowplot() +
geom_label(aes(y = Size_smallest_cluster + 7)) +
scale_fill_continuous(high = "#2ec4b6", low = "#cbf3f0") +
labs(x = "K", y = "Size of the smallest cluster")
p3 = pure_by_threshold %>%
filter(Threshold == paste0("> ", chosen_threshold)) %>%
dplyr::group_by(K) %>%
dplyr::count() %>%
ggplot(aes(x = as.factor(K), y = n, fill = K)) +
geom_bar(stat = "identity", position = "dodge") +
theme_bw() +
scale_fill_continuous(high = "#2ec4b6", low = "#cbf3f0") +
labs(x = "K", y = "Number of assigned samples")
top_row = cowplot::plot_grid(p1, p3 + theme(legend.position = "none"), ncol = 2)
cowplot::plot_grid(top_row, p2 + theme(legend.position = "none"), ncol = 1)
In the case of our analysis, we do not have a very clear-cut minimum value for the cross-entropy criterion value. There is however a plateau starting from K=10. I also look at the number of samples assigned to one cluster or another based on the threshold set previously, as well as at the smallest cluster size. If the cluster size is very small, I would not be able to get meaningful information about them. However, this might be a hint that very real clusters exist in areas where our sampling is too sparse. This is an area to explore further in the future with more comprehensive sampling across the globe.
Based on the information contained in the plots above, I chose to proceed with 11 genetic clusters. I will write different tables to keep the information related to this clustering. For many analyses, the number of samples has an effect of the value computed, so I also create subsets of samples based on the number of isolates from the smallest clusters. For each cluster, I randomly draw 10 times the isolates.
#These values are chosen based on the plot above
chosen_K = 11
write_tsv(snmf_results_per_K %>% filter(K == chosen_K),
file = paste0(PopStr_dir, vcf_name, ".snmf_results_chosen_K.tab"))
#Looking at individuals with admixture coef higher than the threshold defined above.
high_anc_coef_snmf = snmf_results_per_K %>%
filter(K == chosen_K) %>%
filter(Admix_coef > chosen_threshold)
##Writing out tables for later
high_anc_coef_snmf %>% dplyr::select(Sample) %>%
write_tsv(., file = paste0(PopStr_dir, vcf_name, ".high_anc_coef_snmf.ind"),
col_names = F)
high_anc_coef_snmf %>%
write_tsv(., file = paste0(PopStr_dir, vcf_name, ".high_anc_coef_snmf.tsv"),
col_names = T)
# Making lists of samples per cluster for use later
max_continent = dplyr::count(high_anc_coef_snmf, Cluster, Continent) %>%
group_by(Cluster) %>%
mutate(somme = sum(n), prop = n/somme) %>%
filter(prop > 0.5) %>%
dplyr::select(Cluster, Continent)
clusters = dplyr::count(high_anc_coef_snmf, Cluster, Continent)
## Files with all pure isolates per cluster
for (cluster in max_continent %>% pull(Cluster)) {
file_name = paste0(PopStr_dir, "Sample_list_", cluster, ".args")
high_anc_coef_snmf %>% filter(Cluster == cluster) %>%
dplyr::select(Sample) %>%
write_tsv(file = file_name, col_names = F)
}
## Files with a similar number pure isolates between cluster
minimum_cluster_size = min(dplyr::count(high_anc_coef_snmf, Cluster) %>% dplyr::select(n))
for (cluster in max_continent %>% pull(Cluster)) {
for (i in 1:10 ){
file_name = paste0(PopStr_dir, "Sample_list_", cluster, "_", i, ".args")
temp = high_anc_coef_snmf %>%
filter(Cluster == cluster) %>%
sample_n(size = minimum_cluster_size) %>%
dplyr::select(Sample)
write_tsv(temp, file = file_name, col_names = F)
}
}
Once a adequate number of cluster is selected, I investigate their geographical distribution.
#Table of pure samples numbers per continent
kable(snmf_results_per_K %>%
filter(K == chosen_K) %>%
filter(Admix_coef > chosen_threshold) %>%
dplyr::group_by(Continent, Cluster) %>%
dplyr::count() %>%
pivot_wider(names_from = Continent, values_from =n, values_fill = 0))
| Cluster | Africa | Europe | Middle East | North America | Oceania | South America | NA |
|---|---|---|---|---|---|---|---|
| V11 | 21 | 8 | 2 | 0 | 0 | 0 | 0 |
| V2 | 4 | 553 | 0 | 0 | 0 | 0 | 2 |
| V6 | 0 | 0 | 40 | 0 | 0 | 0 | 0 |
| V7 | 0 | 0 | 16 | 0 | 0 | 0 | 0 |
| V10 | 0 | 0 | 0 | 178 | 0 | 0 | 1 |
| V4 | 0 | 0 | 0 | 1 | 36 | 0 | 0 |
| V8 | 0 | 0 | 0 | 20 | 0 | 0 | 0 |
| V1 | 0 | 0 | 0 | 0 | 43 | 0 | 0 |
| V3 | 0 | 0 | 0 | 0 | 31 | 0 | 0 |
| V5 | 0 | 0 | 0 | 0 | 0 | 31 | 0 |
| V9 | 0 | 0 | 0 | 0 | 0 | 16 | 0 |
##Bubble plot
p_cluster = high_anc_coef_snmf %>%
dplyr::group_by(Continent, for_display, Cluster) %>%
dplyr::count() %>%
ggplot(aes(x = for_display, y = Cluster,
size = n, color = Continent)) +
geom_point(alpha = 0.5) + Color_Continent+ theme_bw() +
theme(axis.text.x = element_text(angle = 60, hjust = 1)) +
labs(x = "",
title = "Pure isolates per country for the chosen K value",
subtitle = str_wrap(paste0("Number of genotypes with admix coef > ", chosen_threshold,
" assigned to ", chosen_K, " clusters."),
width = 70),
size = "Nb of isolates")
p_cluster
The clustering data can also be represented as an average of ancestry coefficient per country. This is done here first with barplots.
#Setting data with the chosen number of K as well as the chosen coefficient threshold
chosen_coef = snmf_results_per_K %>%
filter(K == chosen_K) %>%
filter(Country != "NA") %>%
filter(Country != "USA_NA") %>%
dplyr::select(Sample, Continent, Country, Admix_coef, Cluster, Latitude, Longitude)
K_colors = c("#8ECAE6", #V1 Aus (TAS)
"#49810E", #V10 USA
"#E16684", #V11 North Africa
"#FFBA0A", #V2 Europe
"#126782", #V3 NZ
"#219EBC", #V4 Australia (NSW)
"#8FA253", #V5 Uruguay + Argentina
"#650104", #V6 Israel + Turkey
"#DE020A", #V7 Iran
"#2A4908", #V8 Canada
"#B3C186") #V9 Boliva + Chile + Ecuador
#Identifying the main cluster per country
cluster_per_country = high_anc_coef_snmf %>%
group_by(Country, Cluster) %>%
dplyr::summarise(n = n()) %>%
dplyr::mutate(freq = n / sum(n)) %>%
filter(freq > 0.5) %>%
dplyr::select(Country, Main_country_cluster = Cluster)
# Cluster composition per country for all continents
temp = chosen_coef %>%
group_by(Continent, Country, Cluster) %>%
dplyr::summarize(average_coef = mean(Admix_coef),
Nb_per_country = n()) %>%
dplyr::mutate(Cluster2 = ifelse(average_coef < 0.1, "Other", Cluster)) %>%
filter(Nb_per_country >= 6)
#Bar plot per country
temp %>% filter(Continent != "NA") %>%
filter(Continent != "Asia") %>%
ggplot(aes(x = Country, y = average_coef, fill = Cluster2)) +
geom_bar(stat = "identity") +
scale_fill_manual(values = c("grey", K_colors) ) +
theme_cowplot() +
labs(subtitle = "All continents, limited to countries with 6 or more isolates",
title = "Cluster ancestry per country",
y = "Average of the ancestry coefficient", fill = "Cluster") +
facet_grid(cols = vars(Continent), switch = "x", scales = "free_x", space = "free_x") +
theme(axis.title.x = element_blank(),
axis.title.y = element_text(size = 12),
axis.text.x = element_text(angle = 50, hjust = 1, vjust = 1, size = 8),
axis.text.y = element_text(hjust = 1, vjust = 1, size = 10),
panel.spacing = unit(0, "lines"),
strip.background = element_blank(),
strip.placement = "bottom",
strip.text = element_text(size = 10),
legend.position = "bottom")
Such visualization is useful but not as intuitive as a map! Let’s use the same summarising method but with a more local viewpoint (I use rounded coordinates instead of country).
#Summarizing based on samples found in neighbouring areas
temp = chosen_coef %>%
mutate(Latitude = round(Latitude/2)*2, Longitude = round(Longitude/2)*2)%>%
group_by(Continent, Country, Cluster, Longitude, Latitude) %>%
summarize(average_coef = mean(Admix_coef), number_of_isolates = n()) %>%
filter(!is.na(Latitude))
write_tsv(temp, file = paste0(PopStr_dir, vcf_name, ".chosen_coef_pies.tab"))
#Transforming to fit the scatter pie requirements
pies = temp %>% #filter(number_of_isolates > 2) %>%
mutate(Cluster2 = ifelse(average_coef < 0.1, "Other", Cluster)) %>%
group_by(Continent, Longitude, Latitude, Country, Cluster2, number_of_isolates) %>%
dplyr::summarize(to_plot = sum(average_coef)) %>%
arrange(Cluster2) %>%
pivot_wider(names_from = Cluster2, values_from = to_plot, values_fill = 0) %>%
mutate(radius = ifelse(number_of_isolates > 10, 1.5, log(number_of_isolates))) %>%
dplyr::select(-number_of_isolates)
# Simple map of the world
map1 = ggplot() +
theme_void() +
geom_polygon(data = map_data("world"), aes(x=long, y = lat, group = group), fill="#ede7e3") +
scale_size("Number of genomes", limits = c(1, max_circle)) +
theme(legend.position = "None",
panel.border = element_rect(fill=NA, colour = "grey",size = 0.5, linetype = 1)) +
scale_fill_manual(values = c(c("grey"), K_colors))
#Splitting the map into our 3 main focus areas
p1 = map1 + coord_cartesian(xlim=c(-20, 60), ylim=c(20, 70)) +
geom_scatterpie(data = pies, mapping = aes(x=Longitude, y=Latitude, group=Country, r = radius),
cols = c("Other", "V1", "V10", "V11", "V2", "V3", "V4",
"V5", "V6", "V7", "V8", "V9"), color=NA, alpha=.8)
p2 = map1 + coord_cartesian(xlim=c(-160, -40), ylim=c(-80, 80)) +
geom_scatterpie(data = pies, mapping = aes(x=Longitude, y=Latitude, group=Country, r = radius*2),
cols = c("Other", "V1", "V10", "V11", "V2", "V3", "V4",
"V5", "V6", "V7", "V8", "V9"), color=NA, alpha=.8)
p3 = map1 + coord_cartesian(xlim=c(115, 185), ylim=c(-65, 10)) +
geom_scatterpie(data = pies, mapping = aes(x=Longitude, y=Latitude, group=Country, r = radius*2),
cols = c("Other", "V1", "V10", "V11", "V2", "V3", "V4",
"V5", "V6", "V7", "V8", "V9"), color=NA, alpha=.8)
#Plotting all the maps together!
aus_map = cowplot::plot_grid(p3, get_legend(p1), ncol = 2, rel_widths = c(1, 0.75))
ligne = cowplot::plot_grid(p1, aus_map, ncol = 1, rel_heights = c(1, 0.7))
cowplot::plot_grid(p2, ligne, ncol = 2, rel_widths = c(0.8, 1))
ggsave(paste0(fig_dir, "Str_scatter_pie.pdf"), width = 18, height = 10, units = "cm")
#p2 + theme(
# legend.position = c(.05, .05),
# legend.justification = c("left", "bottom"),
# legend.box.just = "left",
# legend.margin = margin(1,1,1,1)
# )+scale_color_discrete(guide="none")
For each country, we define the local cluster with “votes”: the cluster to which more than half of the non-admixed isolates are originating is considered to be the local cluster. We can quantify the isolates which are non-admixed (or “pure”) isolates from the local cluster, non-admixed from another cluster, or hybrid.
#For each sample, identify if is hybrid, local pure or pure from somewhere else.
status_admix = left_join(chosen_coef, cluster_per_country) %>%
mutate(Main_country_cluster = ifelse(!is.na(Main_country_cluster), Main_country_cluster, "No_main")) %>%
group_by(Sample, Continent, Country) %>%
dplyr::mutate(max_admix = max(Admix_coef),
max_cluster = ifelse(Admix_coef == max_admix, Cluster, "")) %>%
filter(max_cluster != "") %>%
dplyr::mutate(Status = ifelse(max_admix < chosen_threshold, "Hybrid",
ifelse(max_cluster == Main_country_cluster, "Pure local",
"Pure other")))
# Making a pie chart summary
# Count isolate type and compute the position of labels
data = ungroup(status_admix) %>%
dplyr::count(Status, name = "Nb_sample") %>%
dplyr::mutate(Total = sum(Nb_sample)) %>%
arrange(desc(Status)) %>%
mutate(prop = Nb_sample / Total *100) %>%
mutate(ypos = cumsum(prop)- 0.5*prop )
# Basic piechart
ggplot(data, aes(x="", y=prop, fill=Status)) +
geom_bar(stat="identity", width=1, color="white") +
coord_polar("y", start=0) +
theme_void() +
theme(legend.position="right") +
geom_text(aes(y = ypos, label = Nb_sample), color = "black", size=6) +
scale_fill_manual(values = c("#ff9f1c", "#CBF3F0", "#2EC4B6"))
I also want an idea of where the hybrids are located.
# Summary of hybrid category per country
temp = status_admix %>%
group_by(Continent, Country, Main_country_cluster) %>%
dplyr::count(Status)
## As table
kable(pivot_wider(temp, names_from = Status, values_from = n, values_fill = 0))
| Continent | Country | Main_country_cluster | Hybrid | Pure local | Pure other |
|---|---|---|---|---|---|
| Africa | Algeria | V11 | 3 | 5 | 0 |
| Africa | Ethiopia | No_main | 4 | 0 | 0 |
| Africa | Kenya | V2 | 4 | 2 | 0 |
| Africa | Morocco | V11 | 0 | 2 | 0 |
| Africa | Tunisia | V11 | 1 | 14 | 2 |
| Asia | Kazakhstan | No_main | 1 | 0 | 0 |
| Europe | Belarus | No_main | 1 | 0 | 0 |
| Europe | Belgium | V2 | 0 | 14 | 0 |
| Europe | Czech Republic | V2 | 2 | 18 | 0 |
| Europe | Denmark | V2 | 0 | 9 | 0 |
| Europe | France | V2 | 6 | 228 | 4 |
| Europe | Germany | V2 | 1 | 63 | 0 |
| Europe | Hungary | No_main | 2 | 0 | 0 |
| Europe | Ireland | V2 | 0 | 36 | 0 |
| Europe | Italy | V11 | 1 | 2 | 0 |
| Europe | Latvia | V2 | 0 | 2 | 0 |
| Europe | Netherlands | V2 | 2 | 6 | 0 |
| Europe | Poland | V2 | 0 | 5 | 0 |
| Europe | Portugal | No_main | 6 | 0 | 0 |
| Europe | Romania | No_main | 1 | 0 | 0 |
| Europe | Russia | No_main | 2 | 0 | 0 |
| Europe | Spain | V11 | 4 | 2 | 0 |
| Europe | Sweden | V2 | 0 | 6 | 0 |
| Europe | Switzerland | V2 | 2 | 135 | 0 |
| Europe | UK | V2 | 0 | 31 | 0 |
| Europe | Ukraine | No_main | 2 | 0 | 0 |
| Middle East | Iran | V7 | 6 | 16 | 0 |
| Middle East | Israel | V6 | 0 | 39 | 0 |
| Middle East | Syria | V11 | 6 | 1 | 0 |
| Middle East | Turkey | V11 | 10 | 1 | 0 |
| Middle East | Yemen | V6 | 0 | 1 | 0 |
| North America | Canada | V8 | 0 | 11 | 0 |
| North America | Mexico | No_main | 3 | 0 | 0 |
| North America | USA_California | V10 | 0 | 7 | 0 |
| North America | USA_Indiana | V10 | 4 | 7 | 0 |
| North America | USA_Minnesota | V8 | 0 | 2 | 0 |
| North America | USA_Missouri | V10 | 3 | 7 | 0 |
| North America | USA_North Dakota | V8 | 0 | 7 | 0 |
| North America | USA_Ohio | V10 | 0 | 3 | 0 |
| North America | USA_Oregon | V10 | 0 | 145 | 0 |
| North America | USA_Texas | V10 | 1 | 7 | 0 |
| Oceania | Australia_NA | V4 | 1 | 9 | 3 |
| Oceania | Australia_NSW | V4 | 0 | 27 | 0 |
| Oceania | Australia_Tasmania | V1 | 0 | 40 | 0 |
| Oceania | New Zealand | V3 | 24 | 31 | 0 |
| South America | Argentina | V5 | 0 | 16 | 0 |
| South America | Bolivia | V9 | 0 | 1 | 0 |
| South America | Chile | V9 | 0 | 14 | 0 |
| South America | Ecuador | V9 | 1 | 1 | 0 |
| South America | Peru | No_main | 1 | 0 | 0 |
| South America | Uruguay | V5 | 1 | 15 | 0 |
## As a figure
ungroup(temp) %>%
group_by(Country) %>%
dplyr::mutate(Nb_per_country = sum(n)) %>%
filter(Nb_per_country >= 4) %>%
ggplot(aes(x = Country, y = n, fill = Status)) +
geom_bar(stat = "identity", position = "fill") +
theme_cowplot() +
scale_fill_manual(values =c("#2ec4b6", "#EDE7E3", "#cbf3f0")) +
labs(y = "Proportion of isolates")+
facet_grid(rows = vars(Continent), switch = "y", scales = "free_y", space = "free_y") +
theme(axis.title.y = element_blank(),
axis.title.x = element_text(size = 14),
axis.text.x = element_text(size =10),
axis.text.y = element_text(size = 8),
panel.spacing = unit(0, "lines"),
strip.background = element_blank(),
strip.placement = "bottom",
strip.text = element_text(size = 10),
legend.position = "top") +
coord_flip()
#facet_wrap(vars(Continent), scales = "free")
#Table hybrids assigned to several clusters, and pure foreign
temp = inner_join(chosen_coef,
status_admix %>%
filter(Status != "Pure local") %>%
dplyr::select(Sample, Status)) %>%
dplyr::select(Sample, Continent, Country, Latitude, Longitude, Status, Cluster, Admix_coef) %>%
mutate(Admix_coef = ifelse(Admix_coef > 0.2, round(Admix_coef, 4), NA)) %>%
pivot_wider(names_from = Cluster, values_from = Admix_coef) %>%
rowwise() %>%
dplyr::mutate(Count_NA = sum(is.na(c_across(V1:V11)))) %>%
filter(Status == "Pure other" | Count_NA < 10)
opts <- options(knitr.kable.NA = "")
kable(temp, digits = 3, )
| Sample | Continent | Country | Latitude | Longitude | Status | V1 | V2 | V3 | V4 | V5 | V6 | V7 | V8 | V9 | V10 | V11 | Count_NA |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Ethiopia_1992_ETD1 | Africa | Ethiopia | 8.730 | 39.010 | Hybrid | 0.281 | 0.567 | 9 | |||||||||
| Indiana_1993_I25a_1 | North America | USA_Indiana | 40.420 | -86.920 | Hybrid | 0.224 | 0.706 | 9 | |||||||||
| Missouri_1994_ST94MO15a_1 | North America | USA_Missouri | 38.890 | -92.210 | Hybrid | 0.221 | 0.698 | 9 | |||||||||
| ST08TN_26_5_4 | Africa | Tunisia | 36.810 | 10.179 | Pure other | 0.805 | 10 | ||||||||||
| ST13FR_Biog_SeptoDur10 | Europe | France | 44.218 | 0.448 | Hybrid | 0.368 | 0.608 | 9 | |||||||||
| ST13FR_Biog_SeptoDur12 | Europe | France | 44.218 | 0.448 | Hybrid | 0.449 | 0.548 | 9 | |||||||||
| ST13FR_Biog_SeptoDur29 | Europe | France | 44.308 | 4.346 | Pure other | 0.765 | 10 | ||||||||||
| ST13FR_Biog_SeptoDur8 | Europe | France | 44.218 | 0.448 | Hybrid | 0.389 | 0.540 | 9 | |||||||||
| ST13FR_Biog_SeptoDur82 | Europe | France | 44.308 | 4.346 | Pure other | 0.834 | 10 | ||||||||||
| ST13FR_Biog_SeptoDur88 | Europe | France | 44.308 | 4.346 | Pure other | 0.880 | 10 | ||||||||||
| ST13IT_Biog_1819 | Europe | Italy | 41.891 | 12.492 | Hybrid | 0.542 | 0.245 | 9 | |||||||||
| ST13NZ_St13_5_4 | Oceania | New Zealand | -41.211 | 174.777 | Hybrid | 0.246 | 0.488 | 9 | |||||||||
| ST15NZ_St_15640_1 | Oceania | New Zealand | -40.134 | 175.658 | Hybrid | 0.270 | 0.562 | 9 | |||||||||
| ST15NZ_St_15640_2 | Oceania | New Zealand | -40.134 | 175.658 | Hybrid | 0.263 | 0.583 | 9 | |||||||||
| ST15NZ_St_15640_7 | Oceania | New Zealand | -40.134 | 175.658 | Hybrid | 0.300 | 0.525 | 9 | |||||||||
| ST15NZ_St_15640_8 | Oceania | New Zealand | -40.134 | 175.658 | Hybrid | 0.257 | 0.570 | 9 | |||||||||
| ST15NZ_St_15642_11 | Oceania | New Zealand | -46.219 | 169.492 | Hybrid | 0.562 | 0.204 | 9 | |||||||||
| ST15NZ_St_15642_12 | Oceania | New Zealand | -46.219 | 169.492 | Hybrid | 0.213 | 0.580 | 9 | |||||||||
| ST15NZ_St_15642_13 | Oceania | New Zealand | -46.219 | 169.492 | Hybrid | 0.220 | 0.593 | 9 | |||||||||
| ST16KA_Biog_K32 | Asia | Kazakhstan | 54.877 | 69.127 | Hybrid | 0.305 | 0.253 | 9 | |||||||||
| ST16RU_Biog_R1 | Europe | Russia | 55.745 | 37.626 | Hybrid | 0.387 | 0.249 | 9 | |||||||||
| ST16RU_Biog_R19 | Europe | Russia | 55.745 | 37.626 | Hybrid | 0.331 | 0.241 | 9 | |||||||||
| ST_SRR2834990 | Oceania | Australia_NA | -35.296 | 149.122 | Pure other | 0.967 | 10 | ||||||||||
| ST_SRR2835000 | Oceania | Australia_NA | -35.296 | 149.122 | Pure other | 0.978 | 10 | ||||||||||
| ST_SRR2835057 | Oceania | Australia_NA | -35.296 | 149.122 | Pure other | 0.915 | 10 | ||||||||||
| STnnJGI_SRR3452689 | Middle East | Turkey | 39.931 | 32.833 | Hybrid | 0.252 | 0.622 | 9 | |||||||||
| STnnJGI_SRR3452696 | North America | Mexico | 19.440 | -99.134 | Hybrid | 0.630 | 0.237 | 9 | |||||||||
| STnnJGI_SRR3452697 | South America | Peru | -12.022 | -77.041 | Hybrid | 0.318 | 0.681 | 9 | |||||||||
| STnnJGI_SRR3452698 | Europe | Portugal | 38.716 | -9.127 | Hybrid | 0.280 | 0.254 | 0.216 | 8 | ||||||||
| STnnJGI_SRR3452699 | Europe | Portugal | 38.716 | -9.127 | Hybrid | 0.310 | 0.205 | 0.253 | 8 | ||||||||
| STnnJGI_SRR3452700 | Africa | Algeria | 36.818 | 3.048 | Hybrid | 0.342 | 0.304 | 9 | |||||||||
| STnnJGI_SRR3452702 | Middle East | Syria | 33.517 | 36.316 | Hybrid | 0.259 | 0.564 | 9 | |||||||||
| STnnJGI_SRR3452704 | Africa | Algeria | 36.818 | 3.048 | Hybrid | 0.338 | 0.302 | 9 | |||||||||
| STnnJGI_SRR3452713 | Europe | France | 48.853 | 2.347 | Pure other | 0.885 | 10 | ||||||||||
| STnnJGI_SRR3452753 | Europe | Hungary | 47.504 | 19.050 | Hybrid | 0.286 | 0.315 | 9 | |||||||||
| STnnJGI_SRR3452770 | Middle East | Syria | 33.517 | 36.316 | Hybrid | 0.319 | 0.635 | 9 | |||||||||
| STnnJGI_SRR3452776 | Africa | Tunisia | 36.810 | 10.179 | Pure other | 0.844 | 10 | ||||||||||
| STnnJGI_SRR3452779 | Europe | Portugal | 38.716 | -9.127 | Hybrid | 0.332 | 0.312 | 9 | |||||||||
| STnnJGI_SRR4235082 | Europe | Portugal | 38.716 | -9.127 | Hybrid | 0.282 | 0.265 | 9 | |||||||||
| STnnJGI_SRR4235083 | Africa | Algeria | 36.818 | 3.048 | Hybrid | 0.352 | 0.236 | 9 | |||||||||
| STnnJGI_SRR4235084 | Oceania | New Zealand | -41.211 | 174.777 | Hybrid | 0.405 | 0.292 | 0.249 | 8 | ||||||||
| STnnJGI_SRR4235085 | Africa | Tunisia | 36.810 | 10.179 | Hybrid | 0.401 | 0.236 | 9 | |||||||||
| STnnJGI_SRR4235086 | North America | Mexico | 19.440 | -99.134 | Hybrid | 0.660 | 0.215 | 9 | |||||||||
| STnnJGI_SRR4235089 | Oceania | New Zealand | -41.211 | 174.777 | Hybrid | 0.406 | 0.234 | 0.265 | 8 | ||||||||
| STnnJGI_SRR4235092 | Europe | Portugal | 38.716 | -9.127 | Hybrid | 0.325 | 0.306 | 0.224 | 8 | ||||||||
| STnnJGI_SRR4235093 | Europe | Portugal | 38.716 | -9.127 | Hybrid | 0.227 | 0.224 | 0.232 | 8 | ||||||||
| STnnJGI_SRR4235113 | Europe | Romania | 44.428 | 26.084 | Hybrid | 0.222 | 0.452 | 9 | |||||||||
| STnnJGI_SRR5194479 | Europe | Spain | 40.416 | -3.708 | Hybrid | 0.322 | 0.218 | 0.216 | 8 | ||||||||
| STnnJGI_SRR5194515 | Europe | Spain | 40.416 | -3.708 | Hybrid | 0.265 | 0.201 | 0.246 | 8 | ||||||||
| STnnJGI_SRR5194596 | Middle East | Iran | 35.727 | 51.385 | Hybrid | 0.222 | 0.384 | 9 | |||||||||
| STnnJGI_SRR5194605 | Europe | Hungary | 47.504 | 19.050 | Hybrid | 0.272 | 0.406 | 9 | |||||||||
| STnnJGI_SRR5194607 | Middle East | Syria | 33.500 | 35.800 | Hybrid | 0.277 | 0.619 | 9 | |||||||||
| STnnJGI_SRR5829674 | Oceania | New Zealand | -41.211 | 174.777 | Hybrid | 0.440 | 0.249 | 0.264 | 8 | ||||||||
| Syria_1992_SYK2 | Middle East | Syria | 33.517 | 36.316 | Hybrid | 0.285 | 0.583 | 9 | |||||||||
| Syria_1992_SYT3 | Middle East | Syria | 36.010 | 36.940 | Hybrid | 0.239 | 0.596 | 9 | |||||||||
| Ukraine_1995_ST95UR_BIc_1 | Europe | Ukraine | 46.430 | 30.690 | Hybrid | 0.308 | 0.304 | 9 | |||||||||
| Ukraine_1995_ST95UR_F1a_3 | Europe | Ukraine | 46.430 | 30.690 | Hybrid | 0.314 | 0.278 | 9 |
write_tsv(x = temp, file = paste0(PopStr_dir, "Hybrids_and_pure_foreign.tab"))
# Simple map of the world
map1 = ggplot() +
theme_void() +
geom_polygon(data = map_data("world"), aes(x=long, y = lat, group = group), fill="#ede7e3") +
theme(panel.border = element_rect(fill=NA, colour = "grey",size = 0.5, linetype = 1))
#Splitting the map into our 3 main focus areas
p1 = map1 + coord_cartesian(xlim=c(-20, 60), ylim=c(20, 70)) +
geom_jitter(data = temp, aes(Longitude, Latitude, col = Status), width = 1, height = 1) +
scale_color_manual(values = c("#ff9f1c", "#2EC4B6"))
p2 = map1 + coord_cartesian(xlim=c(-160, -40), ylim=c(-80, 80)) +
geom_jitter(data = temp, aes(Longitude, Latitude, col = Status), width = 1, height = 1) +
scale_color_manual(values = c("#ff9f1c", "#2EC4B6"))
p3 = map1 + coord_cartesian(xlim=c(115, 175), ylim=c(-65, 10)) +
geom_jitter(data = temp, aes(Longitude, Latitude, col = Status), width = 2, height = 2) +
scale_color_manual(values = c("#ff9f1c", "#2EC4B6"))
#Plotting all the maps together!
aus_map = cowplot::plot_grid(p3 + theme(legend.position = "none"), get_legend(p1), ncol = 2, rel_widths = c(1, 0.9))
ligne = cowplot::plot_grid(p1 + theme(legend.position = "none"), aus_map, ncol = 1, rel_heights = c(1, 0.7))
cowplot::plot_grid(p2 + theme(legend.position = "none"), ligne, ncol = 2, rel_widths = c(0.7, 1))
ggsave(paste0(fig_dir, "Hybrids_scatter_pie.pdf"), width = 18, height = 10, units = "cm")
As a second method to investigate the population structure of Z.tritici at the world-wide scale, I chose to do a principal component analysis based on a subset of the SNPs. The results from the PCA and from the clutering analysis are coherent with each other: Oceania separates into 3 clusters (one in New_Zealand, and two in Australia) and the North American isolates form two separate clusters. Higher K values also distinguish a Middle-Eastern/African cluster from the European cluster, representing the two extreme points of the gradient found between these populations in the PCA. Despite the high number of isolates from Europe, it’s interesting to see that no clustering appears there.
snpgdsVCF2GDS(paste0(vcf_dir, vcf_name, ".recode.vcf"),
paste0(PopStr_dir, vcf_name, ".recode.gds"), method="biallelic.only")
genofile <- snpgdsOpen(paste0(PopStr_dir, vcf_name, ".recode.gds"))
pca <-snpgdsPCA(genofile)
snpgdsClose(genofile)
pca2 = as_tibble(pca$eigenvect) %>% dplyr::select(V1:V8)
colnames(pca2) = c("PC1", "PC2", "PC3", "PC4", "PC5", "PC6", "PC7", "PC8")
pca2 = pca2 %>%
dplyr::mutate(sample_id = pca$sample.id ) %>%
dplyr::right_join(., status_admix, by = c("sample_id" = "Sample")) %>%
filter(!is.na(PC1))
#Writing table to store PCA results
pca2 %>%
dplyr::select(ID_file = sample_id, PC1, PC2, PC3, PC4, PC5, PC6, PC7, PC8,
Continent, Country, Latitude, Longitude) %>%
write_tsv(., file = paste0(PopStr_dir, vcf_name, ".PCA_results.tab"),
col_names = T)
#
as.tibble(pca$eigenval[!is.na(pca$eigenval)]) %>%
ggplot(aes(x = c(1:length(pca$eigenval[!is.na(pca$eigenval)])),
y =value)) +
geom_point() +
theme_bw() +
labs(y = "Eigenvalue", x = "Principal component")
eigen_sum = sum(pca$eigenval[!is.na(pca$eigenval)])
for_plot = pca2 %>% mutate(Cluster = ifelse(Status == "Hybrid", Status, Cluster))
#Defining colors per cluster based on cluster
myColors2 <- c("grey", "#129eba", "#3F6E0C", "#DA4167", "#ffba0a", "#129eba", "#129eba",
"#8fa253", "#A20106", "#A20106", "#3F6E0C", "#8fa253")
myShapes <- c(1, 1, 1, 1, 1, 2, 0,
1, 1, 0, 0, 0)
names(myColors2) = levels(factor(for_plot$Cluster))
names(myShapes) = levels(factor(for_plot$Cluster))
Color_Cluster = ggplot2::scale_colour_manual(name = "Cluster", values = myColors2)
Fill_Cluster = ggplot2::scale_fill_manual(name = "Cluster", values = myColors2)
Shape_Cluster = ggplot2::scale_shape_manual(name = "Cluster", values = myShapes)
## Detailed color scheme
K_colors = c("grey",
"#8ECAE6", #V1 Aus (TAS)
"#49810E", #V10 USA
"#E16684", #V11 North Africa
"#FFBA0A", #V2 Europe
"#126782", #V3 NZ
"#219EBC", #V4 Australia (NSW)
"#8FA253", #V5 Uruguay + Argentina
"#650104", #V6 Israel + Turkey
"#DE020A", #V7 Iran
"#2A4908", #V8 Canada
"#B3C186") #V9 Boliva + Chile + Ecuador
names(K_colors) = levels(factor(for_plot$Cluster))
Color_Cluster2 = ggplot2::scale_colour_manual(name = "Cluster", values = K_colors)
Fill_Cluster2 = ggplot2::scale_fill_manual(name = "Cluster", values = K_colors)
# Dot plots
p1 = ggplot(for_plot, aes(x = PC1, y= PC2, shape = Cluster)) +
geom_point(aes(color = Cluster)) +
labs(x = paste0("PC 1 (", round(pca$eigenval[1]*100/eigen_sum, 2), "%)"),
y = paste0("PC 2 (", round(pca$eigenval[2]*100/eigen_sum, 2), "%)")) +
Color_Cluster + Shape_Cluster +
theme_bw()
p2 = ggplot(for_plot, aes(x = PC3, y= PC4, shape = Cluster)) +
geom_point(aes(color = Cluster)) +
labs(x = paste0("PC 3 (", round(pca$eigenval[3]*100/eigen_sum, 2), "%)"),
y = paste0("PC 4 (", round(pca$eigenval[4]*100/eigen_sum, 2), "%)")) +
Color_Cluster + Shape_Cluster +
theme_bw()
p3 = ggplot(for_plot, aes(x = PC5, y= PC6, shape = Cluster)) +
geom_point(aes(color = Cluster)) +
labs(x = paste0("PC 5 (", round(pca$eigenval[5]*100/eigen_sum, 2), "%)"),
y = paste0("PC 6 (", round(pca$eigenval[6]*100/eigen_sum, 2), "%)")) +
Color_Cluster + Shape_Cluster +
theme_bw()
p4 = ggplot(for_plot, aes(x = PC7, y= PC8, shape = Cluster)) +
geom_point(aes(color = Cluster)) +
labs(x = paste0("PC 7 (", round(pca$eigenval[7]*100/eigen_sum, 2), "%)"),
y = paste0("PC 8 (", round(pca$eigenval[8]*100/eigen_sum, 2), "%)")) +
Color_Cluster + Shape_Cluster +
theme_bw()
PCA_grid = cowplot::plot_grid(p1 + theme(legend.position = "none"), p2 + theme(legend.position = "none"),
p3 + theme(legend.position = "none"), p4 + theme(legend.position = "none"))
cowplot::plot_grid(PCA_grid, get_legend(p1 + theme(legend.position = "bottom")), ncol =1, rel_heights = c(1, 0.2))
p11 = filter(for_plot, Cluster != "Hybrid") %>%
ggplot(aes(PC1, color = Cluster, fill = Cluster)) +
geom_density(alpha = .3)+
Color_Cluster + Fill_Cluster +
theme_cowplot() +
theme(axis.title = element_blank(), axis.text = element_blank(),
plot.margin = unit(c(0,0,0,0), "cm"), axis.ticks = element_blank())
p12 = filter(for_plot, Cluster != "Hybrid") %>%
ggplot(aes(PC2, color = Cluster, fill = Cluster)) +
geom_density(alpha = .3)+
Color_Cluster + Fill_Cluster +
theme_cowplot() +
theme(axis.title = element_blank(), axis.text = element_blank(),
plot.margin = unit(c(0,0,0,0), "cm"), axis.ticks = element_blank())
p13 = filter(for_plot, Cluster != "Hybrid") %>%
ggplot(aes(PC3, color = Cluster, fill = Cluster)) +
geom_density(alpha = .3)+
Color_Cluster + Fill_Cluster +
theme_cowplot() +
theme(axis.title = element_blank(), axis.text = element_blank(),
plot.margin = unit(c(0,0,0,0), "cm"), axis.ticks = element_blank())
cowplot::plot_grid(p11+ theme(legend.position = "none"),
p12 + theme(legend.position = "none"),
p13 + theme(legend.position = "none"),
ncol = 1, align = "hv")
ggMarginal(ggplot(for_plot, aes(x = PC2, y= PC3, shape = Cluster)) +
geom_point(aes(color = Cluster)) +
labs(x = paste0("PC 2 (", round(pca$eigenval[2]*100/eigen_sum, 2), "%)"),
y = paste0("PC 3 (", round(pca$eigenval[3]*100/eigen_sum, 2), "%)")) +
Color_Cluster + Shape_Cluster +
theme_bw() + theme(legend.position = "bottom"), groupColour = TRUE, groupFill = TRUE)
ggplot2::theme_set(theme_light())
p = ggpairs(pca2, columns = c(1:6),
ggplot2::aes(col=Continent, fill = Continent, alpha = 0.6),
title = "PCA based thinned SNPs",
upper = list(continuous = "points", combo = "box_no_facet"))
for(i in 1:p$nrow) {
for(j in 1:p$ncol){
p[i,j] <- p[i,j] + theme_light() + Color_Continent + Fill_Continent
}
}
p
In the steps before, I have learned about population history indirectly by inferring genetic populations from the genomic data. The relationship between the population and the underlying demography is not explicit in these however. It is possible however to infer splits between populations and create a population tree. Here, I use treemix, which takes into account the possibility of gene flow between populations and indeed test of it in the process of creating a population tree.
Because the populations in the clustering were not perfectly distinct from one another, I start with “discretized” populations by choosing only the isolates with high ancestry in one of the sNMF clusters.
#With only Zymoseptoria tritici (but more markers)
~/Documents/Software/vcftools_jydu/src/cpp/vcftools \
--vcf ${VCFDIR}$VCFNAME.recode.vcf \
--keep ${POPSTR}$VCFNAME.high_anc_coef_snmf.ind \
--remove-filtered-all --extract-FORMAT-info GT \
--max-missing 1.0 --min-alleles 2 --max-alleles 2 \
--maf 0.05 \
--out ${POPSTR}$VCFNAME.high_anc_coef_snmf
cat ${POPSTR}$VCFNAME.high_anc_coef_snmf.GT.FORMAT | cut -f 3- \
> ${POPSTR}$VCFNAME.high_anc_coef_snmf.GT.FORMAT2
#With only the positions found in the outgroup
~/Documents/Software/vcftools_jydu/src/cpp/vcftools \
--vcf ${VCFDIR}$VCFNAME.recode.vcf \
--keep ${POPSTR}$VCFNAME.high_anc_coef_snmf.ind \
--remove-filtered-all --max-missing 1.0 --min-alleles 2 --max-alleles 2 --maf 0.05 \
--positions ${VCFDIR}est_sfs_position_list.for_vcftools \
--extract-FORMAT-info GT \
--out ${POPSTR}High_anc_coef_only_outgroups_positions
In the treemix analysis, I want to use the sequences from the strains Zpa63 and Za17, respectively from the species Zymoseptoria passerinii and Zymoseptoria ardabilia, as outgroups. Here is how I went from the fully assembled sequences to the snps to add to the Z. tritici intra-species variants. There are also some data wrangling and table reformatting steps to get the variants ready for Treemix.
#For both Zpa63 and Za17
./dnadiff \
-p /data2/alice/WW_project/1_Variant_calling/3_Per_sample_calling/Zpa63 \
/data2/alice/WW_project/0_Data/Zymoseptoria_tritici.MG2.dna.toplevel.mt+.fa \
/legserv/NGS_data/Zymoseptoria/Zt_Reference_genomes/Sister_species_Feurtey2020/Zpa63_softmasked_for_publication.fa
./delta-filter \
-1 /data2/alice/WW_project/1_Variant_calling/3_Per_sample_calling/Zpa63.delta \
> /data2/alice/WW_project/1_Variant_calling/3_Per_sample_calling/Zpa63.filtered.delta
./show-coords -r -T /data2/alice/WW_project/1_Variant_calling/3_Per_sample_calling/Zpa63.filtered.delta | \
awk 'BEGIN {OFS = "\t"} NR > 4 {print $8, $1, $2}' \
> /data2/alice/WW_project/1_Variant_calling/3_Per_sample_calling/Zpa63.filtered.bed
# Getting the single alignments for the two species
~/Software/bedtools intersect \
-a /data2/alice/WW_project/1_Variant_calling/3_Per_sample_calling/Za17.filtered.bed \
-b /data2/alice/WW_project/1_Variant_calling/3_Per_sample_calling/Zpa63.filtered.bed \
> /data2/alice/WW_project/1_Variant_calling/3_Per_sample_calling/Za17_filtered_insersect_Zpa63_filtered.bed
# Filtering the snps to keep only the ones in the good intervals
~/Software/bedtools intersect -a /data2/alice/WW_project/1_Variant_calling/3_Per_sample_calling/Zpa63.snps ^C
awk 'BEGIN {OFS = "\t"} {print $11, $1, $1, $2, $3}' \
/data2/alice/WW_project/1_Variant_calling/3_Per_sample_calling/Zpa63.snps \
> /data2/alice/WW_project/1_Variant_calling/3_Per_sample_calling/Zpa63.snps.bed ;
echo -e "CHROM\tPOS\tREF\tZpa63" > /data2/alice/WW_project/1_Variant_calling/3_Per_sample_calling/Zpa63.filtered.snps
~/Software/bedtools intersect \
-a /data2/alice/WW_project/1_Variant_calling/3_Per_sample_calling/Zpa63.snps.bed \
-b /data2/alice/WW_project/1_Variant_calling/3_Per_sample_calling/Za17_filtered_insersect_Zpa63_filtered.bed \
-wa | cut -f 1,2,4,5 >> /data2/alice/WW_project/1_Variant_calling/3_Per_sample_calling/Zpa63.filtered.snps
awk 'BEGIN {OFS = "\t"} {print $11, $1, $1, $2, $3}' \
/data2/alice/WW_project/1_Variant_calling/3_Per_sample_calling/Za17.snps \
> /data2/alice/WW_project/1_Variant_calling/3_Per_sample_calling/Za17.snps.bed ;
echo -e "CHROM\tPOS\tREF\tZa17" > /data2/alice/WW_project/1_Variant_calling/3_Per_sample_calling/Za17.filtered.snps
~/Software/bedtools intersect \
-a /data2/alice/WW_project/1_Variant_calling/3_Per_sample_calling/Za17.snps.bed \
-b /data2/alice/WW_project/1_Variant_calling/3_Per_sample_calling/Za17_filtered_insersect_Zpa63_filtered.bed \
-wa | cut -f 1,2,4,5 >> /data2/alice/WW_project/1_Variant_calling/3_Per_sample_calling/Za17.filtered.snps
# Filtering the counts from the whole vcf to keep only the ones in the good intervals
#!/bin/bash
source $1
${VCFTOOLS_PATH} \
--gzvcf ${vcf_dir}${VCFBasename}.genotyped.ALL.filtered.clean.AB_filtered.variants.good_samples.max-m-80.vcf.gz \
--min-alleles 2 --max-alleles 2 --mac 1 --remove-indels \
--out ${vcf_dir}${VCFBasename}.genotyped.ALL.filtered.clean.AB_filtered.variants.good_samples.max-m-80.biall_SNPs.intersect_Za_Zpa \
--counts --bed /data2/alice/WW_project/1_Variant_calling/3_Per_sample_calling/Za17_filtered_insersect_Zpa63_filtered.bed
#Read outgroup SNPS
temp = inner_join( read_delim(paste0(VAR_dir, "Za17.filtered.snps")) %>%
filter(Za17 != "." & REF != ".") %>% distinct(),
read_delim(paste0(VAR_dir, "Zpa63.filtered.snps")) %>%
filter(Zpa63 != "." & REF != ".") %>% distinct())
#Merge to previously created GT table
data_for_treemix = inner_join(read_tsv(paste0(PopStr_dir, "High_anc_coef_only_outgroups_positions.alleles_info"),
col_names = c("CHROM", "POS","REF", "ALT")),
read_tsv(paste0(PopStr_dir, "High_anc_coef_only_outgroups_positions.GT.FORMAT"))) %>%
left_join(temp)
# This was simply to check that everything was matching as it should
#temp = cbind(read_delim(paste0(VAR_dir, "est_sfs_position_list.txt"), delim = "\t"),
# read_delim(paste0(VAR_dir, "est_sfs_pval.txt"), delim = " ", skip = 7,
# col_names = c("Line_nb", "Conf_index", "Probability")) %>%
# dplyr::select(Line_nb, Probability)) %>%
#mutate(Ancestral_allele = ifelse(Probability > 0.95, Major_allele, ifelse(Probability < 0.05, Minor_allele, "Uncertain")))
#data_for_treemix = data_for_treemix %>% left_join(temp)
# REmoving positions with third allele
# Changing the format to 0/1
data_for_treemix = data_for_treemix %>%
filter(is.na(Za17) | Za17 == ALT) %>%
filter(is.na(Zpa63) | Zpa63 == ALT) %>%
dplyr::mutate(Za17 = ifelse(is.na(Za17), 0, 1),
Zpa63 = ifelse(is.na(Zpa63), 0, 1))
data_for_treemix %>% dplyr::select(-CHROM, -POS, -REF, -ALT) %>%
write_delim(paste0(PopStr_dir, "High_anc_coef_only_outgroups_positions.with_outgroups.GT.FORMAT2"), delim = "\t")
#conda activate r-reticulate
from collections import defaultdict
#For each isolate, store its pop (as in sampling site) in a dictionary
dict_pop = dict(zip(list(r.high_anc_coef_snmf["Sample"]) + ["Za17", "Zpa63"],
list(r.high_anc_coef_snmf["Cluster"]) + ["Za17", "Zpa63"]))
#Keep a list of the pop names/coordinates to write in the same order later
all_pops = sorted(list(set(list(r.high_anc_coef_snmf["Cluster"])+ ["Za17", "Zpa63"])))
#out_name = r.PopStr_dir + r.vcf_name + ".high_anc_coef_snmf.treemix"
out_name = "/Users/afeurtey/Documents/Postdoc_Bruce/Projects/WW_project/2_Population_structure/High_anc_coef_only_outgroups_positions.with_outgroups.treemix"
out = open(out_name, "w")
shutup = out.write(" ".join(all_pops) + "\n")
#with open(r.PopStr_dir + r.vcf_name + ".high_anc_coef_snmf.GT.FORMAT2", "r") as input_snps :
with open("/Users/afeurtey/Documents/Postdoc_Bruce/Projects/WW_project/2_Population_structure/High_anc_coef_only_outgroups_positions.with_outgroups.GT.FORMAT2", "r") as input_snps :
for i, snp in enumerate(input_snps) :
#Setting two dictionaries with values at 0
dict_snp0 = defaultdict(int)
dict_snp1 = defaultdict(int)
Lets_write = True
#The first line is the name of the isolates
if i == 0 :
indv = snp.strip().split("\t")
Lets_write = False
else :
#Keeping isolate name and allelic value together
alleles = zip(indv, snp.strip().split("\t"))
#...and counting the O and 1 based on the pop
for ind, allele in alleles:
if allele == "0" :
dict_snp0[dict_pop[ind]] += 1
elif allele == "1" :
dict_snp1[dict_pop[ind]] += 1
else :
print("Only biallelic please!!!!")
Lets_write = False
#If I have not found anything weird, I will write the result to the output file.
if Lets_write :
shutup = out.write(" ".join([",".join([str(dict_snp0[pop]), str(dict_snp1[pop])]) for pop in all_pops]) + "\n")
print("All done!")
out.close()
Once the tables are properly formatted, it’s time to run the treemix software and to graphically explore the results.
POPSTR="/Users/afeurtey/Documents/Postdoc_Bruce/Projects/WW_project/2_Population_structure/"
#With outgroup
if [ -f ${POPSTR}High_anc_coef_only_outgroups_positions.with_outgroups ] ;
then
gzip ${POPSTR}High_anc_coef_only_outgroups_positions.with_outgroups.treemix
fi
treemix \
-i ${POPSTR}High_anc_coef_only_outgroups_positions.with_outgroups.treemix.gz \
-o ${POPSTR}High_anc_coef_only_outgroups_positions.with_outgroups.m0.treemix.out \
-m 0 -root Zpa63
treemix \
-i ${POPSTR}High_anc_coef_only_outgroups_positions.with_outgroups.treemix.gz \
-o ${POPSTR}High_anc_coef_only_outgroups_positions.with_outgroups.m1.treemix.out \
-m 1 -root Zpa63
treemix \
-i ${POPSTR}High_anc_coef_only_outgroups_positions.with_outgroups.treemix.gz \
-o ${POPSTR}High_anc_coef_only_outgroups_positions.with_outgroups.m2.treemix.out \
-m 2 -root Zpa63
treemix \
-i ${POPSTR}High_anc_coef_only_outgroups_positions.with_outgroups.treemix.gz \
-o ${POPSTR}High_anc_coef_only_outgroups_positions.with_outgroups.m3.treemix.out \
-m 3 -root Zpa63
treemix \
-i ${POPSTR}High_anc_coef_only_outgroups_positions.with_outgroups.treemix.gz \
-o ${POPSTR}High_anc_coef_only_outgroups_positions.with_outgroups.m4.treemix.out \
-m 4 -root Zpa63
treemix \
-i ${POPSTR}High_anc_coef_only_outgroups_positions.with_outgroups.treemix.gz \
-o ${POPSTR}High_anc_coef_only_outgroups_positions.with_outgroups.m5.treemix.out \
-m 5 -root Zpa63
treemix \
-i ${POPSTR}High_anc_coef_only_outgroups_positions.with_outgroups.treemix.gz \
-o ${POPSTR}High_anc_coef_only_outgroups_positions.with_outgroups.m6.treemix.out \
-m 6 -root Zpa63
treemix \
-i ${POPSTR}High_anc_coef_only_outgroups_positions.with_outgroups.treemix.gz \
-o ${POPSTR}High_anc_coef_only_outgroups_positions.with_outgroups.m7.treemix.out \
-m 7 -root Zpa63
source("~/Documents/Software/treemix-1.13/src/plotting_funcs.R")
#p_cluster
t = plot_tree(paste0(PopStr_dir, "High_anc_coef_only_outgroups_positions.with_outgroups.m0.treemix.out"))
## V1 V2 V3 V4 V5 V6 V7 V8 V9 V10
## 1 1 Zpa63 NOT_ROOT NOT_MIG TIP 305 NA NA NA NA
## 2 2 <NA> NOT_ROOT NOT_MIG NOT_TIP 305 4 1 32 11
## 3 3 V6 NOT_ROOT NOT_MIG TIP 16 NA NA NA NA
## 4 4 Za17 NOT_ROOT NOT_MIG TIP 2 NA NA NA NA
## 5 15 V7 NOT_ROOT NOT_MIG TIP 16 NA NA NA NA
## 6 16 <NA> NOT_ROOT NOT_MIG NOT_TIP 172 15 1 3 1
## 7 31 V4 NOT_ROOT NOT_MIG TIP 136 NA NA NA NA
## 8 32 <NA> NOT_ROOT NOT_MIG NOT_TIP 2 172 3 104 8
## 9 51 V10 NOT_ROOT NOT_MIG TIP 76 NA NA NA NA
## 10 52 <NA> NOT_ROOT NOT_MIG NOT_TIP 104 76 2 256 4
## 11 75 V8 NOT_ROOT NOT_MIG TIP 76 NA NA NA NA
## 12 76 <NA> NOT_ROOT NOT_MIG NOT_TIP 52 51 1 75 1
## 13 103 V9 NOT_ROOT NOT_MIG TIP 304 NA NA NA NA
## 14 104 <NA> NOT_ROOT NOT_MIG NOT_TIP 32 52 6 304 2
## 15 135 V1 NOT_ROOT NOT_MIG TIP 136 NA NA NA NA
## 16 136 <NA> NOT_ROOT NOT_MIG NOT_TIP 212 31 1 135 1
## 17 171 V11 NOT_ROOT NOT_MIG TIP 172 NA NA NA NA
## 18 172 <NA> NOT_ROOT NOT_MIG NOT_TIP 32 16 2 171 1
## 19 211 V3 NOT_ROOT NOT_MIG TIP 212 NA NA NA NA
## 20 212 <NA> NOT_ROOT NOT_MIG NOT_TIP 256 136 2 211 1
## 21 255 V2 NOT_ROOT NOT_MIG TIP 256 NA NA NA NA
## 22 256 <NA> NOT_ROOT NOT_MIG NOT_TIP 52 212 3 255 1
## 23 303 V5 NOT_ROOT NOT_MIG TIP 304 NA NA NA NA
## 24 304 <NA> NOT_ROOT NOT_MIG NOT_TIP 104 303 1 103 1
## 25 305 <NA> ROOT NOT_MIG NOT_TIP 305 1 1 2 12
## V11
## 1 Zpa63:5.733e-07
## 2 (Za17:1.1466e-06,(((V7:0.0253622,V6:0.00582189):0.011378,V11:0.0234466):0.00913495,(((V10:0.0294537,V8:0.0457607):0.00834699,(((V4:0.0575342,V1:0.0439875):0.0107982,V3:0.0339801):0.0115416,V2:0):0.00477896):0.00342986,(V5:0.0106011,V9:0.0319032):0.00930602):0.00846123):0.182701):5.733e-07
## 3 V6:0.00582189
## 4 Za17:1.1466e-06
## 5 V7:0.0253622
## 6 (V7:0.0253622,V6:0.00582189):0.011378
## 7 V4:0.0575342
## 8 (((V7:0.0253622,V6:0.00582189):0.011378,V11:0.0234466):0.00913495,(((V10:0.0294537,V8:0.0457607):0.00834699,(((V4:0.0575342,V1:0.0439875):0.0107982,V3:0.0339801):0.0115416,V2:0):0.00477896):0.00342986,(V5:0.0106011,V9:0.0319032):0.00930602):0.00846123):0.182701
## 9 V10:0.0294537
## 10 ((V10:0.0294537,V8:0.0457607):0.00834699,(((V4:0.0575342,V1:0.0439875):0.0107982,V3:0.0339801):0.0115416,V2:0):0.00477896):0.00342986
## 11 V8:0.0457607
## 12 (V10:0.0294537,V8:0.0457607):0.00834699
## 13 V9:0.0319032
## 14 (((V10:0.0294537,V8:0.0457607):0.00834699,(((V4:0.0575342,V1:0.0439875):0.0107982,V3:0.0339801):0.0115416,V2:0):0.00477896):0.00342986,(V5:0.0106011,V9:0.0319032):0.00930602):0.00846123
## 15 V1:0.0439875
## 16 (V4:0.0575342,V1:0.0439875):0.0107982
## 17 V11:0.0234466
## 18 ((V7:0.0253622,V6:0.00582189):0.011378,V11:0.0234466):0.00913495
## 19 V3:0.0339801
## 20 ((V4:0.0575342,V1:0.0439875):0.0107982,V3:0.0339801):0.0115416
## 21 V2:0
## 22 (((V4:0.0575342,V1:0.0439875):0.0107982,V3:0.0339801):0.0115416,V2:0):0.00477896
## 23 V5:0.0106011
## 24 (V5:0.0106011,V9:0.0319032):0.00930602
## 25 (Zpa63:5.733e-07,(Za17:1.1466e-06,(((V7:0.0253622,V6:0.00582189):0.011378,V11:0.0234466):0.00913495,(((V10:0.0294537,V8:0.0457607):0.00834699,(((V4:0.0575342,V1:0.0439875):0.0107982,V3:0.0339801):0.0115416,V2:0):0.00477896):0.00342986,(V5:0.0106011,V9:0.0319032):0.00930602):0.00846123):0.182701):5.733e-07);
## x y ymin ymax
## 1 0.0000005733 0.96153846 0.92307692 1.00000000
## 2 0.0000005733 0.84615385 0.00000000 0.92307692
## 3 0.2090364133 0.73076923 0.69230769 0.76923077
## 4 0.0000017199 0.88461538 0.84615385 0.92307692
## 5 0.2285767233 0.80769231 0.76923077 0.84615385
## 6 0.2032145233 0.76923077 0.69230769 0.84615385
## 7 0.2792456233 0.42307692 0.38461538 0.46153846
## 8 0.1827015733 0.61538462 0.00000000 0.84615385
## 9 0.2323933533 0.57692308 0.53846154 0.61538462
## 10 0.1945926633 0.46153846 0.15384615 0.61538462
## 11 0.2487003533 0.50000000 0.46153846 0.53846154
## 12 0.2029396533 0.53846154 0.46153846 0.61538462
## 13 0.2323720233 0.03846154 0.00000000 0.07692308
## 14 0.1911628033 0.15384615 0.00000000 0.61538462
## 15 0.2656989233 0.34615385 0.30769231 0.38461538
## 16 0.2217114233 0.38461538 0.30769231 0.46153846
## 17 0.2152831233 0.65384615 0.61538462 0.69230769
## 18 0.1918365233 0.69230769 0.61538462 0.84615385
## 19 0.2448933233 0.26923077 0.23076923 0.30769231
## 20 0.2109132233 0.30769231 0.23076923 0.46153846
## 21 0.1993716233 0.19230769 0.15384615 0.23076923
## 22 0.1993716233 0.23076923 0.15384615 0.46153846
## 23 0.2110699233 0.11538462 0.07692308 0.15384615
## 24 0.2004688233 0.07692308 0.00000000 0.15384615
## 25 0.0000000000 0.92307692 0.00000000 1.00000000
## [1] 0.0000005733 0.2090364133 0.0000017199 0.2285767233 0.2792456233
## [6] 0.2323933533 0.2487003533 0.2323720233 0.2656989233 0.2152831233
## [11] 0.2448933233 0.1993716233 0.2110699233
## [1] 0.003
## [1] "mse 0.000965112171597633"
t = plot_tree(paste0(PopStr_dir, "High_anc_coef_only_outgroups_positions.with_outgroups.m1.treemix.out"))
## V1 V2 V3 V4 V5 V6 V7 V8 V9 V10
## 1 1 Zpa63 NOT_ROOT NOT_MIG TIP 305 NA NA NA NA
## 2 2 <NA> NOT_ROOT NOT_MIG NOT_TIP 305 4 1 32 11
## 3 3 V6 NOT_ROOT NOT_MIG TIP 16 NA NA NA NA
## 4 4 Za17 NOT_ROOT NOT_MIG TIP 2 NA NA NA NA
## 5 15 V7 NOT_ROOT NOT_MIG TIP 16 NA NA NA NA
## 6 16 <NA> NOT_ROOT NOT_MIG NOT_TIP 172 15 1 3 1
## 7 31 V4 NOT_ROOT NOT_MIG TIP 136 NA NA NA NA
## 8 32 <NA> NOT_ROOT NOT_MIG NOT_TIP 2 104 8 172 3
## 9 51 V10 NOT_ROOT NOT_MIG TIP 76 NA NA NA NA
## 10 52 <NA> NOT_ROOT NOT_MIG NOT_TIP 104 76 2 256 4
## 11 75 V8 NOT_ROOT NOT_MIG TIP 76 NA NA NA NA
## 12 76 <NA> NOT_ROOT NOT_MIG NOT_TIP 52 51 1 75 1
## 13 103 V9 NOT_ROOT NOT_MIG TIP 304 NA NA NA NA
## 14 104 <NA> NOT_ROOT NOT_MIG NOT_TIP 32 52 6 304 2
## 15 135 V1 NOT_ROOT NOT_MIG TIP 136 NA NA NA NA
## 16 136 <NA> NOT_ROOT NOT_MIG NOT_TIP 212 31 1 135 1
## 17 171 V11 NOT_ROOT NOT_MIG TIP 172 NA NA NA NA
## 18 172 <NA> NOT_ROOT NOT_MIG NOT_TIP 32 16 2 171 1
## 19 211 V3 NOT_ROOT NOT_MIG TIP 212 NA NA NA NA
## 20 212 <NA> NOT_ROOT NOT_MIG NOT_TIP 256 211 1 136 2
## 21 255 V2 NOT_ROOT NOT_MIG TIP 256 NA NA NA NA
## 22 256 <NA> NOT_ROOT NOT_MIG NOT_TIP 52 255 1 212 3
## 23 303 V5 NOT_ROOT NOT_MIG TIP 304 NA NA NA NA
## 24 304 <NA> NOT_ROOT NOT_MIG NOT_TIP 104 303 1 103 1
## 25 305 <NA> ROOT NOT_MIG NOT_TIP 305 1 1 2 12
## 26 321 <NA> NOT_ROOT MIG NOT_TIP 32 172 NA NA NA
## V11
## 1 Zpa63:5.733e-07
## 2 (Za17:1.1466e-06,((((V10:0.0138305,V8:0.181607):0.0229393,(V2:0,(V3:0.0339801,(V4:0.0575342,V1:0.0439875):0.0107982):0.0115416):0.000971518):0.00674799,(V5:0.0106011,V9:0.0319032):0.00826177):0.00947709,((V7:0.0253622,V6:0.00582189):0.011378,V11:0.0234466):0.00951656):0.182304):5.733e-07
## 3 V6:0.00582189
## 4 Za17:1.1466e-06
## 5 V7:0.0253622
## 6 (V7:0.0253622,V6:0.00582189):0.011378
## 7 V4:0.0575342
## 8 ((((V10:0.0138305,V8:0.181607):0.0229393,(V2:0,(V3:0.0339801,(V4:0.0575342,V1:0.0439875):0.0107982):0.0115416):0.000971518):0.00674799,(V5:0.0106011,V9:0.0319032):0.00826177):0.00947709,((V7:0.0253622,V6:0.00582189):0.011378,V11:0.0234466):0.00951656):0.182304
## 9 V10:0.0138305
## 10 ((V10:0.0138305,V8:0.181607):0.0229393,(V2:0,(V3:0.0339801,(V4:0.0575342,V1:0.0439875):0.0107982):0.0115416):0.000971518):0.00674799
## 11 V8:0.181607
## 12 (V10:0.0138305,V8:0.181607):0.0229393
## 13 V9:0.0319032
## 14 (((V10:0.0138305,V8:0.181607):0.0229393,(V2:0,(V3:0.0339801,(V4:0.0575342,V1:0.0439875):0.0107982):0.0115416):0.000971518):0.00674799,(V5:0.0106011,V9:0.0319032):0.00826177):0.00947709
## 15 V1:0.0439875
## 16 (V4:0.0575342,V1:0.0439875):0.0107982
## 17 V11:0.0234466
## 18 ((V7:0.0253622,V6:0.00582189):0.011378,V11:0.0234466):0.00951656
## 19 V3:0.0339801
## 20 (V3:0.0339801,(V4:0.0575342,V1:0.0439875):0.0107982):0.0115416
## 21 V2:0
## 22 (V2:0,(V3:0.0339801,(V4:0.0575342,V1:0.0439875):0.0107982):0.0115416):0.000971518
## 23 V5:0.0106011
## 24 (V5:0.0106011,V9:0.0319032):0.00826177
## 25 (Zpa63:5.733e-07,(Za17:1.1466e-06,((((V10:0.0138305,V8:0.181607):0.0229393,(V2:0,(V3:0.0339801,(V4:0.0575342,V1:0.0439875):0.0107982):0.0115416):0.000971518):0.00674799,(V5:0.0106011,V9:0.0319032):0.00826177):0.00947709,((V7:0.0253622,V6:0.00582189):0.011378,V11:0.0234466):0.00951656):0.182304):5.733e-07);
## 26 <NA>
## x y ymin ymax
## 1 0.0000005733 0.96153846 0.92307692 1.00000000
## 2 0.0000005733 0.84615385 0.00000000 0.92307692
## 3 0.2090210233 0.11538462 0.07692308 0.15384615
## 4 0.0000017199 0.88461538 0.84615385 0.92307692
## 5 0.2285613333 0.19230769 0.15384615 0.23076923
## 6 0.2031991333 0.15384615 0.07692308 0.23076923
## 7 0.2793751713 0.50000000 0.46153846 0.53846154
## 8 0.1823045733 0.23076923 0.00000000 0.84615385
## 9 0.2352994533 0.80769231 0.76923077 0.84615385
## 10 0.1985296533 0.69230769 0.38461538 0.84615385
## 11 0.2728008709 0.73076923 0.69230769 0.76923077
## 12 0.2214689533 0.76923077 0.69230769 0.84615385
## 13 0.2319466333 0.26923077 0.23076923 0.30769231
## 14 0.1917816633 0.38461538 0.23076923 0.84615385
## 15 0.2658284713 0.42307692 0.38461538 0.46153846
## 16 0.2218409713 0.46153846 0.38461538 0.53846154
## 17 0.2152677333 0.03846154 0.00000000 0.07692308
## 18 0.1918211333 0.07692308 0.00000000 0.23076923
## 19 0.2450228713 0.57692308 0.53846154 0.61538462
## 20 0.2110427713 0.53846154 0.38461538 0.61538462
## 21 0.1995011713 0.65384615 0.61538462 0.69230769
## 22 0.1995011713 0.61538462 0.38461538 0.69230769
## 23 0.2106445333 0.34615385 0.30769231 0.38461538
## 24 0.2000434333 0.30769231 0.23076923 0.38461538
## 25 0.0000000000 0.92307692 0.00000000 1.00000000
## 26 0.1889662133 NA 0.00000000 0.23076923
## [1] "0.700005 0.1823045733 0.1918211333"
## [1] 0.0000005733 0.2090210233 0.0000017199 0.2285613333 0.2793751713
## [6] 0.2352994533 0.2728008709 0.2319466333 0.2658284713 0.2152677333
## [11] 0.2450228713 0.1995011713 0.2106445333
## [1] 0.003
## [1] "mse 0.000965112171597633"
t = plot_tree(paste0(PopStr_dir, "High_anc_coef_only_outgroups_positions.with_outgroups.m2.treemix.out"))
## V1 V2 V3 V4 V5 V6 V7 V8 V9 V10
## 1 1 Zpa63 NOT_ROOT NOT_MIG TIP 305 NA NA NA NA
## 2 2 <NA> NOT_ROOT NOT_MIG NOT_TIP 305 4 1 32 11
## 3 3 V6 NOT_ROOT NOT_MIG TIP 16 NA NA NA NA
## 4 4 Za17 NOT_ROOT NOT_MIG TIP 2 NA NA NA NA
## 5 15 V7 NOT_ROOT NOT_MIG TIP 16 NA NA NA NA
## 6 16 <NA> NOT_ROOT NOT_MIG NOT_TIP 172 15 1 3 1
## 7 31 V4 NOT_ROOT NOT_MIG TIP 136 NA NA NA NA
## 8 32 <NA> NOT_ROOT NOT_MIG NOT_TIP 2 172 3 104 8
## 9 51 V10 NOT_ROOT NOT_MIG TIP 76 NA NA NA NA
## 10 52 <NA> NOT_ROOT NOT_MIG NOT_TIP 104 76 2 256 4
## 11 75 V8 NOT_ROOT NOT_MIG TIP 76 NA NA NA NA
## 12 76 <NA> NOT_ROOT NOT_MIG NOT_TIP 52 51 1 75 1
## 13 103 V9 NOT_ROOT NOT_MIG TIP 304 NA NA NA NA
## 14 104 <NA> NOT_ROOT NOT_MIG NOT_TIP 32 52 6 304 2
## 15 135 V1 NOT_ROOT NOT_MIG TIP 136 NA NA NA NA
## 16 136 <NA> NOT_ROOT NOT_MIG NOT_TIP 212 135 1 31 1
## 17 171 V11 NOT_ROOT NOT_MIG TIP 172 NA NA NA NA
## 18 172 <NA> NOT_ROOT NOT_MIG NOT_TIP 32 16 2 171 1
## 19 211 V3 NOT_ROOT NOT_MIG TIP 212 NA NA NA NA
## 20 212 <NA> NOT_ROOT NOT_MIG NOT_TIP 256 136 2 211 1
## 21 255 V2 NOT_ROOT NOT_MIG TIP 256 NA NA NA NA
## 22 256 <NA> NOT_ROOT NOT_MIG NOT_TIP 52 255 1 212 3
## 23 303 V5 NOT_ROOT NOT_MIG TIP 304 NA NA NA NA
## 24 304 <NA> NOT_ROOT NOT_MIG NOT_TIP 104 303 1 103 1
## 25 305 <NA> ROOT NOT_MIG NOT_TIP 305 1 1 2 12
## 26 321 <NA> NOT_ROOT MIG NOT_TIP 32 172 NA NA NA
## 27 365 <NA> NOT_ROOT MIG NOT_TIP 136 31 NA NA NA
## V11
## 1 Zpa63:1.13067e-06
## 2 (Za17:2.26134e-06,(((V7:0.0253633,V6:0.005823):0.011378,V11:0.0234477):0.00952983,(((V10:0.0139363,V8:0.1872):0.0223637,(V2:0,((V1:0.0435682,V4:0.0579283):0.0111987,V3:0.033596):0.0119633):0.00114129):0.00837946,(V5:0.00707124,V9:0.0461891):0.0118319):0.00751057):0.182401):1.13067e-06
## 3 V6:0.005823
## 4 Za17:2.26134e-06
## 5 V7:0.0253633
## 6 (V7:0.0253633,V6:0.005823):0.011378
## 7 V4:0.0579283
## 8 (((V7:0.0253633,V6:0.005823):0.011378,V11:0.0234477):0.00952983,(((V10:0.0139363,V8:0.1872):0.0223637,(V2:0,((V1:0.0435682,V4:0.0579283):0.0111987,V3:0.033596):0.0119633):0.00114129):0.00837946,(V5:0.00707124,V9:0.0461891):0.0118319):0.00751057):0.182401
## 9 V10:0.0139363
## 10 ((V10:0.0139363,V8:0.1872):0.0223637,(V2:0,((V1:0.0435682,V4:0.0579283):0.0111987,V3:0.033596):0.0119633):0.00114129):0.00837946
## 11 V8:0.1872
## 12 (V10:0.0139363,V8:0.1872):0.0223637
## 13 V9:0.0461891
## 14 (((V10:0.0139363,V8:0.1872):0.0223637,(V2:0,((V1:0.0435682,V4:0.0579283):0.0111987,V3:0.033596):0.0119633):0.00114129):0.00837946,(V5:0.00707124,V9:0.0461891):0.0118319):0.00751057
## 15 V1:0.0435682
## 16 (V1:0.0435682,V4:0.0579283):0.0111987
## 17 V11:0.0234477
## 18 ((V7:0.0253633,V6:0.005823):0.011378,V11:0.0234477):0.00952983
## 19 V3:0.033596
## 20 ((V1:0.0435682,V4:0.0579283):0.0111987,V3:0.033596):0.0119633
## 21 V2:0
## 22 (V2:0,((V1:0.0435682,V4:0.0579283):0.0111987,V3:0.033596):0.0119633):0.00114129
## 23 V5:0.00707124
## 24 (V5:0.00707124,V9:0.0461891):0.0118319
## 25 (Zpa63:1.13067e-06,(Za17:2.26134e-06,(((V7:0.0253633,V6:0.005823):0.011378,V11:0.0234477):0.00952983,(((V10:0.0139363,V8:0.1872):0.0223637,(V2:0,((V1:0.0435682,V4:0.0579283):0.0111987,V3:0.033596):0.0119633):0.00114129):0.00837946,(V5:0.00707124,V9:0.0461891):0.0118319):0.00751057):0.182401):1.13067e-06);
## 26 <NA>
## 27 <NA>
## x y ymin ymax
## 1 1.130670e-06 0.96153846 0.92307692 1.00000000
## 2 1.130670e-06 0.84615385 0.00000000 0.92307692
## 3 2.091330e-01 0.73076923 0.69230769 0.76923077
## 4 3.392010e-06 0.88461538 0.84615385 0.92307692
## 5 2.286733e-01 0.80769231 0.76923077 0.84615385
## 6 2.033100e-01 0.76923077 0.69230769 0.84615385
## 7 2.805239e-01 0.26923077 0.23076923 0.30769231
## 8 1.824021e-01 0.61538462 0.00000000 0.84615385
## 9 2.345922e-01 0.57692308 0.53846154 0.61538462
## 10 1.982922e-01 0.46153846 0.15384615 0.61538462
## 11 2.719467e-01 0.50000000 0.46153846 0.53846154
## 12 2.206559e-01 0.53846154 0.46153846 0.61538462
## 13 2.356191e-01 0.03846154 0.00000000 0.07692308
## 14 1.899127e-01 0.15384615 0.00000000 0.61538462
## 15 2.661637e-01 0.34615385 0.30769231 0.38461538
## 16 2.225955e-01 0.30769231 0.23076923 0.38461538
## 17 2.153797e-01 0.65384615 0.61538462 0.69230769
## 18 1.919320e-01 0.69230769 0.61538462 0.84615385
## 19 2.449928e-01 0.19230769 0.15384615 0.23076923
## 20 2.113968e-01 0.23076923 0.15384615 0.38461538
## 21 1.994335e-01 0.42307692 0.38461538 0.46153846
## 22 1.994335e-01 0.38461538 0.15384615 0.46153846
## 23 2.088158e-01 0.11538462 0.07692308 0.15384615
## 24 2.017446e-01 0.07692308 0.00000000 0.15384615
## 25 0.000000e+00 0.92307692 0.00000000 1.00000000
## 26 1.881335e-01 NA 0.00000000 0.61538462
## 27 2.537494e-01 NA 0.23076923 0.30769231
## [1] "0.601412 0.18240213067 0.19193197067"
## [1] "0.5378 0.22259545067 0.28052385067"
## [1] 1.130670e-06 2.091330e-01 3.392010e-06 2.286733e-01 2.805239e-01
## [6] 2.345922e-01 2.719467e-01 2.356191e-01 2.661637e-01 2.153797e-01
## [11] 2.449928e-01 1.994335e-01 2.088158e-01
## [1] 0.003
## [1] "mse 0.000965112171597633"
t = plot_tree(paste0(PopStr_dir, "High_anc_coef_only_outgroups_positions.with_outgroups.m3.treemix.out"))
## V1 V2 V3 V4 V5 V6 V7 V8 V9 V10
## 1 1 Zpa63 NOT_ROOT NOT_MIG TIP 305 NA NA NA NA
## 2 2 <NA> NOT_ROOT NOT_MIG NOT_TIP 305 4 1 32 11
## 3 3 V6 NOT_ROOT NOT_MIG TIP 16 NA NA NA NA
## 4 4 Za17 NOT_ROOT NOT_MIG TIP 2 NA NA NA NA
## 5 15 V7 NOT_ROOT NOT_MIG TIP 16 NA NA NA NA
## 6 16 <NA> NOT_ROOT NOT_MIG NOT_TIP 172 15 1 3 1
## 7 31 V4 NOT_ROOT NOT_MIG TIP 136 NA NA NA NA
## 8 32 <NA> NOT_ROOT NOT_MIG NOT_TIP 2 172 3 104 8
## 9 51 V10 NOT_ROOT NOT_MIG TIP 76 NA NA NA NA
## 10 52 <NA> NOT_ROOT NOT_MIG NOT_TIP 104 76 2 256 4
## 11 75 V8 NOT_ROOT NOT_MIG TIP 76 NA NA NA NA
## 12 76 <NA> NOT_ROOT NOT_MIG NOT_TIP 52 51 1 75 1
## 13 103 V9 NOT_ROOT NOT_MIG TIP 304 NA NA NA NA
## 14 104 <NA> NOT_ROOT NOT_MIG NOT_TIP 32 52 6 304 2
## 15 135 V1 NOT_ROOT NOT_MIG TIP 136 NA NA NA NA
## 16 136 <NA> NOT_ROOT NOT_MIG NOT_TIP 212 135 1 31 1
## 17 171 V11 NOT_ROOT NOT_MIG TIP 172 NA NA NA NA
## 18 172 <NA> NOT_ROOT NOT_MIG NOT_TIP 32 16 2 171 1
## 19 211 V3 NOT_ROOT NOT_MIG TIP 212 NA NA NA NA
## 20 212 <NA> NOT_ROOT NOT_MIG NOT_TIP 256 136 2 211 1
## 21 255 V2 NOT_ROOT NOT_MIG TIP 256 NA NA NA NA
## 22 256 <NA> NOT_ROOT NOT_MIG NOT_TIP 52 255 1 212 3
## 23 303 V5 NOT_ROOT NOT_MIG TIP 304 NA NA NA NA
## 24 304 <NA> NOT_ROOT NOT_MIG NOT_TIP 104 303 1 103 1
## 25 305 <NA> ROOT NOT_MIG NOT_TIP 305 1 1 2 12
## 26 321 <NA> NOT_ROOT MIG NOT_TIP 32 172 NA NA NA
## 27 365 <NA> NOT_ROOT MIG NOT_TIP 136 31 NA NA NA
## 28 404 <NA> NOT_ROOT MIG NOT_TIP 136 31 NA NA NA
## V11
## 1 Zpa63:1.16913e-06
## 2 (Za17:2.33828e-06,(((V7:0.0253634,V6:0.00582308):0.011378,V11:0.0234478):0.0095298,(((V10:0.0139909,V8:0.186654):0.0222929,(V2:0,((V1:0.0425648,V4:0.058998):0.019938,V3:0.0589117):0.00321):0.00114881):0.00836871,(V5:0.00723673,V9:0.0448731):0.011667):0.00751525):0.182401):1.16913e-06
## 3 V6:0.00582308
## 4 Za17:2.33828e-06
## 5 V7:0.0253634
## 6 (V7:0.0253634,V6:0.00582308):0.011378
## 7 V4:0.058998
## 8 (((V7:0.0253634,V6:0.00582308):0.011378,V11:0.0234478):0.0095298,(((V10:0.0139909,V8:0.186654):0.0222929,(V2:0,((V1:0.0425648,V4:0.058998):0.019938,V3:0.0589117):0.00321):0.00114881):0.00836871,(V5:0.00723673,V9:0.0448731):0.011667):0.00751525):0.182401
## 9 V10:0.0139909
## 10 ((V10:0.0139909,V8:0.186654):0.0222929,(V2:0,((V1:0.0425648,V4:0.058998):0.019938,V3:0.0589117):0.00321):0.00114881):0.00836871
## 11 V8:0.186654
## 12 (V10:0.0139909,V8:0.186654):0.0222929
## 13 V9:0.0448731
## 14 (((V10:0.0139909,V8:0.186654):0.0222929,(V2:0,((V1:0.0425648,V4:0.058998):0.019938,V3:0.0589117):0.00321):0.00114881):0.00836871,(V5:0.00723673,V9:0.0448731):0.011667):0.00751525
## 15 V1:0.0425648
## 16 (V1:0.0425648,V4:0.058998):0.019938
## 17 V11:0.0234478
## 18 ((V7:0.0253634,V6:0.00582308):0.011378,V11:0.0234478):0.0095298
## 19 V3:0.0589117
## 20 ((V1:0.0425648,V4:0.058998):0.019938,V3:0.0589117):0.00321
## 21 V2:0
## 22 (V2:0,((V1:0.0425648,V4:0.058998):0.019938,V3:0.0589117):0.00321):0.00114881
## 23 V5:0.00723673
## 24 (V5:0.00723673,V9:0.0448731):0.011667
## 25 (Zpa63:1.16913e-06,(Za17:2.33828e-06,(((V7:0.0253634,V6:0.00582308):0.011378,V11:0.0234478):0.0095298,(((V10:0.0139909,V8:0.186654):0.0222929,(V2:0,((V1:0.0425648,V4:0.058998):0.019938,V3:0.0589117):0.00321):0.00114881):0.00836871,(V5:0.00723673,V9:0.0448731):0.011667):0.00751525):0.182401):1.16913e-06);
## 26 <NA>
## 27 <NA>
## 28 <NA>
## x y ymin ymax
## 1 1.169130e-06 0.96153846 0.92307692 1.00000000
## 2 1.169130e-06 0.84615385 0.00000000 0.92307692
## 3 2.091330e-01 0.73076923 0.69230769 0.76923077
## 4 3.507410e-06 0.88461538 0.84615385 0.92307692
## 5 2.286734e-01 0.80769231 0.76923077 0.84615385
## 6 2.033100e-01 0.76923077 0.69230769 0.84615385
## 7 2.815809e-01 0.26923077 0.23076923 0.30769231
## 8 1.824022e-01 0.61538462 0.00000000 0.84615385
## 9 2.345699e-01 0.57692308 0.53846154 0.61538462
## 10 1.982861e-01 0.46153846 0.15384615 0.61538462
## 11 2.718618e-01 0.50000000 0.46153846 0.53846154
## 12 2.205790e-01 0.53846154 0.46153846 0.61538462
## 13 2.353702e-01 0.03846154 0.00000000 0.07692308
## 14 1.899174e-01 0.15384615 0.00000000 0.61538462
## 15 2.651477e-01 0.34615385 0.30769231 0.38461538
## 16 2.225829e-01 0.30769231 0.23076923 0.38461538
## 17 2.153798e-01 0.65384615 0.61538462 0.69230769
## 18 1.919320e-01 0.69230769 0.61538462 0.84615385
## 19 2.425220e-01 0.19230769 0.15384615 0.23076923
## 20 2.026449e-01 0.23076923 0.15384615 0.38461538
## 21 1.994349e-01 0.42307692 0.38461538 0.46153846
## 22 1.994349e-01 0.38461538 0.15384615 0.46153846
## 23 2.088211e-01 0.11538462 0.07692308 0.15384615
## 24 2.015844e-01 0.07692308 0.00000000 0.15384615
## 25 0.000000e+00 0.92307692 0.00000000 1.00000000
## 26 1.881391e-01 NA 0.00000000 0.61538462
## 27 2.630530e-01 NA 0.23076923 0.30769231
## 28 2.815809e-01 NA 0.23076923 0.30769231
## [1] "0.602 0.18240216913 0.19193196913"
## [1] "0.685957 0.22258293913 0.28158093913"
## [1] "1 0.22258293913 0.28158093913"
## [1] 1.169130e-06 2.091330e-01 3.507410e-06 2.286734e-01 2.815809e-01
## [6] 2.345699e-01 2.718618e-01 2.353702e-01 2.651477e-01 2.153798e-01
## [11] 2.425220e-01 1.994349e-01 2.088211e-01
## [1] 0.003
## [1] "mse 0.000965112171597633"
t = plot_tree(paste0(PopStr_dir, "High_anc_coef_only_outgroups_positions.with_outgroups.m4.treemix.out"))
## V1 V2 V3 V4 V5 V6 V7 V8 V9 V10
## 1 1 Zpa63 NOT_ROOT NOT_MIG TIP 305 NA NA NA NA
## 2 2 <NA> NOT_ROOT NOT_MIG NOT_TIP 305 4 1 32 11
## 3 3 V6 NOT_ROOT NOT_MIG TIP 16 NA NA NA NA
## 4 4 Za17 NOT_ROOT NOT_MIG TIP 2 NA NA NA NA
## 5 15 V7 NOT_ROOT NOT_MIG TIP 16 NA NA NA NA
## 6 16 <NA> NOT_ROOT NOT_MIG NOT_TIP 172 3 1 15 1
## 7 31 V4 NOT_ROOT NOT_MIG TIP 136 NA NA NA NA
## 8 32 <NA> NOT_ROOT NOT_MIG NOT_TIP 2 172 3 104 8
## 9 51 V10 NOT_ROOT NOT_MIG TIP 76 NA NA NA NA
## 10 52 <NA> NOT_ROOT NOT_MIG NOT_TIP 104 304 2 76 2
## 11 75 V8 NOT_ROOT NOT_MIG TIP 76 NA NA NA NA
## 12 76 <NA> NOT_ROOT NOT_MIG NOT_TIP 52 51 1 75 1
## 13 103 V9 NOT_ROOT NOT_MIG TIP 304 NA NA NA NA
## 14 104 <NA> NOT_ROOT NOT_MIG NOT_TIP 32 256 4 52 4
## 15 135 V1 NOT_ROOT NOT_MIG TIP 136 NA NA NA NA
## 16 136 <NA> NOT_ROOT NOT_MIG NOT_TIP 212 135 1 31 1
## 17 171 V11 NOT_ROOT NOT_MIG TIP 172 NA NA NA NA
## 18 172 <NA> NOT_ROOT NOT_MIG NOT_TIP 32 16 2 171 1
## 19 211 V3 NOT_ROOT NOT_MIG TIP 212 NA NA NA NA
## 20 212 <NA> NOT_ROOT NOT_MIG NOT_TIP 256 136 2 211 1
## 21 255 V2 NOT_ROOT NOT_MIG TIP 256 NA NA NA NA
## 22 256 <NA> NOT_ROOT NOT_MIG NOT_TIP 104 255 1 212 3
## 23 303 V5 NOT_ROOT NOT_MIG TIP 304 NA NA NA NA
## 24 304 <NA> NOT_ROOT NOT_MIG NOT_TIP 52 103 1 303 1
## 25 305 <NA> ROOT NOT_MIG NOT_TIP 305 1 1 2 12
## 26 319 <NA> NOT_ROOT MIG NOT_TIP 32 172 NA NA NA
## 27 365 <NA> NOT_ROOT MIG NOT_TIP 136 31 NA NA NA
## 28 404 <NA> NOT_ROOT MIG NOT_TIP 136 31 NA NA NA
## 29 481 <NA> NOT_ROOT MIG NOT_TIP 32 172 NA NA NA
## V11
## 1 Zpa63:1.16484e-06
## 2 (Za17:2.3297e-06,(((V6:0.00582307,V7:0.0253634):0.011378,V11:0.0234478):0.0103609,((V2:0,((V1:0.0425672,V4:0.0589969):0.0199511,V3:0.0589596):0.00319451):0.00116464,((V9:0.0448765,V5:0.00724181):0.0449264,(V10:0.0150421,V8:0.167223):0.0212643):0):0.0157095):0.181488):1.16484e-06
## 3 V6:0.00582307
## 4 Za17:2.3297e-06
## 5 V7:0.0253634
## 6 (V6:0.00582307,V7:0.0253634):0.011378
## 7 V4:0.0589969
## 8 (((V6:0.00582307,V7:0.0253634):0.011378,V11:0.0234478):0.0103609,((V2:0,((V1:0.0425672,V4:0.0589969):0.0199511,V3:0.0589596):0.00319451):0.00116464,((V9:0.0448765,V5:0.00724181):0.0449264,(V10:0.0150421,V8:0.167223):0.0212643):0):0.0157095):0.181488
## 9 V10:0.0150421
## 10 ((V9:0.0448765,V5:0.00724181):0.0449264,(V10:0.0150421,V8:0.167223):0.0212643):0
## 11 V8:0.167223
## 12 (V10:0.0150421,V8:0.167223):0.0212643
## 13 V9:0.0448765
## 14 ((V2:0,((V1:0.0425672,V4:0.0589969):0.0199511,V3:0.0589596):0.00319451):0.00116464,((V9:0.0448765,V5:0.00724181):0.0449264,(V10:0.0150421,V8:0.167223):0.0212643):0):0.0157095
## 15 V1:0.0425672
## 16 (V1:0.0425672,V4:0.0589969):0.0199511
## 17 V11:0.0234478
## 18 ((V6:0.00582307,V7:0.0253634):0.011378,V11:0.0234478):0.0103609
## 19 V3:0.0589596
## 20 ((V1:0.0425672,V4:0.0589969):0.0199511,V3:0.0589596):0.00319451
## 21 V2:0
## 22 (V2:0,((V1:0.0425672,V4:0.0589969):0.0199511,V3:0.0589596):0.00319451):0.00116464
## 23 V5:0.00724181
## 24 (V9:0.0448765,V5:0.00724181):0.0449264
## 25 (Zpa63:1.16484e-06,(Za17:2.3297e-06,(((V6:0.00582307,V7:0.0253634):0.011378,V11:0.0234478):0.0103609,((V2:0,((V1:0.0425672,V4:0.0589969):0.0199511,V3:0.0589596):0.00319451):0.00116464,((V9:0.0448765,V5:0.00724181):0.0449264,(V10:0.0150421,V8:0.167223):0.0212643):0):0.0157095):0.181488):1.16484e-06);
## 26 <NA>
## 27 <NA>
## 28 <NA>
## 29 <NA>
## x y ymin ymax
## 1 1.164840e-06 0.96153846 0.92307692 1.00000000
## 2 1.164840e-06 0.84615385 0.00000000 0.92307692
## 3 2.090511e-01 0.80769231 0.76923077 0.84615385
## 4 3.494540e-06 0.88461538 0.84615385 0.92307692
## 5 2.285914e-01 0.73076923 0.69230769 0.76923077
## 6 2.032280e-01 0.76923077 0.69230769 0.84615385
## 7 2.805058e-01 0.42307692 0.38461538 0.46153846
## 8 1.814892e-01 0.61538462 0.00000000 0.84615385
## 9 2.335051e-01 0.11538462 0.07692308 0.15384615
## 10 1.971987e-01 0.15384615 0.00000000 0.30769231
## 11 2.696251e-01 0.03846154 0.00000000 0.07692308
## 12 2.184630e-01 0.07692308 0.00000000 0.15384615
## 13 2.471303e-01 0.26923077 0.23076923 0.30769231
## 14 1.971987e-01 0.30769231 0.00000000 0.61538462
## 15 2.640761e-01 0.50000000 0.46153846 0.53846154
## 16 2.215089e-01 0.46153846 0.38461538 0.53846154
## 17 2.152978e-01 0.65384615 0.61538462 0.69230769
## 18 1.918500e-01 0.69230769 0.61538462 0.84615385
## 19 2.414438e-01 0.34615385 0.30769231 0.38461538
## 20 2.015578e-01 0.38461538 0.30769231 0.53846154
## 21 1.983633e-01 0.57692308 0.53846154 0.61538462
## 22 1.983633e-01 0.53846154 0.30769231 0.61538462
## 23 2.205882e-01 0.19230769 0.15384615 0.23076923
## 24 2.133464e-01 0.23076923 0.15384615 0.30769231
## 25 0.000000e+00 0.92307692 0.00000000 1.00000000
## 26 1.896468e-01 NA 0.00000000 0.61538462
## 27 2.617726e-01 NA 0.38461538 0.46153846
## 28 2.805058e-01 NA 0.38461538 0.46153846
## 29 1.918500e-01 NA 0.00000000 0.61538462
## [1] "0.787347 0.18148916484 0.19185004524"
## [1] "0.682471 0.22150891484 0.28050581484"
## [1] "1 0.22150891484 0.28050581484"
## [1] "0.787476 0.18148916484 0.19185004524"
## [1] 1.164840e-06 2.090511e-01 3.494540e-06 2.285914e-01 2.805058e-01
## [6] 2.335051e-01 2.696251e-01 2.471303e-01 2.640761e-01 2.152978e-01
## [11] 2.414438e-01 1.983633e-01 2.205882e-01
## [1] 0.003
## [1] "mse 0.000965112171597633"
t = plot_tree(paste0(PopStr_dir, "High_anc_coef_only_outgroups_positions.with_outgroups.m5.treemix.out"))
## V1 V2 V3 V4 V5 V6 V7 V8 V9 V10
## 1 1 Zpa63 NOT_ROOT NOT_MIG TIP 305 NA NA NA NA
## 2 2 <NA> NOT_ROOT NOT_MIG NOT_TIP 305 4 1 32 11
## 3 3 V6 NOT_ROOT NOT_MIG TIP 16 NA NA NA NA
## 4 4 Za17 NOT_ROOT NOT_MIG TIP 2 NA NA NA NA
## 5 15 V7 NOT_ROOT NOT_MIG TIP 16 NA NA NA NA
## 6 16 <NA> NOT_ROOT NOT_MIG NOT_TIP 172 3 1 15 1
## 7 31 V4 NOT_ROOT NOT_MIG TIP 136 NA NA NA NA
## 8 32 <NA> NOT_ROOT NOT_MIG NOT_TIP 2 172 3 104 8
## 9 51 V10 NOT_ROOT NOT_MIG TIP 76 NA NA NA NA
## 10 52 <NA> NOT_ROOT NOT_MIG NOT_TIP 104 76 2 304 2
## 11 75 V8 NOT_ROOT NOT_MIG TIP 76 NA NA NA NA
## 12 76 <NA> NOT_ROOT NOT_MIG NOT_TIP 52 51 1 75 1
## 13 103 V9 NOT_ROOT NOT_MIG TIP 304 NA NA NA NA
## 14 104 <NA> NOT_ROOT NOT_MIG NOT_TIP 32 256 4 52 4
## 15 135 V1 NOT_ROOT NOT_MIG TIP 136 NA NA NA NA
## 16 136 <NA> NOT_ROOT NOT_MIG NOT_TIP 212 135 1 31 1
## 17 171 V11 NOT_ROOT NOT_MIG TIP 172 NA NA NA NA
## 18 172 <NA> NOT_ROOT NOT_MIG NOT_TIP 32 16 2 171 1
## 19 211 V3 NOT_ROOT NOT_MIG TIP 212 NA NA NA NA
## 20 212 <NA> NOT_ROOT NOT_MIG NOT_TIP 256 136 2 211 1
## 21 255 V2 NOT_ROOT NOT_MIG TIP 256 NA NA NA NA
## 22 256 <NA> NOT_ROOT NOT_MIG NOT_TIP 104 255 1 212 3
## 23 303 V5 NOT_ROOT NOT_MIG TIP 304 NA NA NA NA
## 24 304 <NA> NOT_ROOT NOT_MIG NOT_TIP 52 303 1 103 1
## 25 305 <NA> ROOT NOT_MIG NOT_TIP 305 1 1 2 12
## 26 321 <NA> NOT_ROOT MIG NOT_TIP 32 172 NA NA NA
## 27 365 <NA> NOT_ROOT MIG NOT_TIP 136 31 NA NA NA
## 28 404 <NA> NOT_ROOT MIG NOT_TIP 136 31 NA NA NA
## 29 484 <NA> NOT_ROOT MIG NOT_TIP 32 172 NA NA NA
## 30 533 <NA> NOT_ROOT MIG NOT_TIP 212 211 NA NA NA
## V11
## 1 Zpa63:1.43583e-06
## 2 (Za17:2.87164e-06,(((V6:0.00582361,V7:0.0253639):0.011378,V11:0.0234483):0.0103338,((V2:0,((V1:0.042426,V4:0.0589594):0.0213852,V3:0.068422):0.00197171):0.00325345,((V10:0.0154789,V8:0.160978):0.0646884,(V5:0.00726655,V9:0.0445142):0.0345319):0):0.0133917):0.181589):1.43583e-06
## 3 V6:0.00582361
## 4 Za17:2.87164e-06
## 5 V7:0.0253639
## 6 (V6:0.00582361,V7:0.0253639):0.011378
## 7 V4:0.0589594
## 8 (((V6:0.00582361,V7:0.0253639):0.011378,V11:0.0234483):0.0103338,((V2:0,((V1:0.042426,V4:0.0589594):0.0213852,V3:0.068422):0.00197171):0.00325345,((V10:0.0154789,V8:0.160978):0.0646884,(V5:0.00726655,V9:0.0445142):0.0345319):0):0.0133917):0.181589
## 9 V10:0.0154789
## 10 ((V10:0.0154789,V8:0.160978):0.0646884,(V5:0.00726655,V9:0.0445142):0.0345319):0
## 11 V8:0.160978
## 12 (V10:0.0154789,V8:0.160978):0.0646884
## 13 V9:0.0445142
## 14 ((V2:0,((V1:0.042426,V4:0.0589594):0.0213852,V3:0.068422):0.00197171):0.00325345,((V10:0.0154789,V8:0.160978):0.0646884,(V5:0.00726655,V9:0.0445142):0.0345319):0):0.0133917
## 15 V1:0.042426
## 16 (V1:0.042426,V4:0.0589594):0.0213852
## 17 V11:0.0234483
## 18 ((V6:0.00582361,V7:0.0253639):0.011378,V11:0.0234483):0.0103338
## 19 V3:0.068422
## 20 ((V1:0.042426,V4:0.0589594):0.0213852,V3:0.068422):0.00197171
## 21 V2:0
## 22 (V2:0,((V1:0.042426,V4:0.0589594):0.0213852,V3:0.068422):0.00197171):0.00325345
## 23 V5:0.00726655
## 24 (V5:0.00726655,V9:0.0445142):0.0345319
## 25 (Zpa63:1.43583e-06,(Za17:2.87164e-06,(((V6:0.00582361,V7:0.0253639):0.011378,V11:0.0234483):0.0103338,((V2:0,((V1:0.042426,V4:0.0589594):0.0213852,V3:0.068422):0.00197171):0.00325345,((V10:0.0154789,V8:0.160978):0.0646884,(V5:0.00726655,V9:0.0445142):0.0345319):0):0.0133917):0.181589):1.43583e-06);
## 26 <NA>
## 27 <NA>
## 28 <NA>
## 29 <NA>
## 30 <NA>
## x y ymin ymax
## 1 1.435830e-06 0.96153846 0.92307692 1.00000000
## 2 1.435830e-06 0.84615385 0.00000000 0.92307692
## 3 2.091258e-01 0.80769231 0.76923077 0.84615385
## 4 4.307470e-06 0.88461538 0.84615385 0.92307692
## 5 2.286661e-01 0.73076923 0.69230769 0.76923077
## 6 2.033022e-01 0.76923077 0.69230769 0.84615385
## 7 2.805520e-01 0.42307692 0.38461538 0.46153846
## 8 1.815904e-01 0.61538462 0.00000000 0.84615385
## 9 2.316277e-01 0.26923077 0.23076923 0.30769231
## 10 1.949821e-01 0.15384615 0.00000000 0.30769231
## 11 2.672453e-01 0.19230769 0.15384615 0.23076923
## 12 2.161488e-01 0.23076923 0.15384615 0.30769231
## 13 2.441714e-01 0.03846154 0.00000000 0.07692308
## 14 1.949821e-01 0.30769231 0.00000000 0.61538462
## 15 2.640185e-01 0.50000000 0.46153846 0.53846154
## 16 2.215925e-01 0.46153846 0.38461538 0.53846154
## 17 2.153725e-01 0.65384615 0.61538462 0.69230769
## 18 1.919242e-01 0.69230769 0.61538462 0.84615385
## 19 2.409830e-01 0.34615385 0.30769231 0.38461538
## 20 2.002073e-01 0.38461538 0.30769231 0.53846154
## 21 1.982356e-01 0.57692308 0.53846154 0.61538462
## 22 1.982356e-01 0.53846154 0.30769231 0.61538462
## 23 2.176475e-01 0.11538462 0.07692308 0.15384615
## 24 2.103809e-01 0.07692308 0.00000000 0.15384615
## 25 0.000000e+00 0.92307692 0.00000000 1.00000000
## 26 1.902816e-01 NA 0.00000000 0.61538462
## 27 2.646733e-01 NA 0.38461538 0.46153846
## 28 2.805520e-01 NA 0.38461538 0.46153846
## 29 1.919242e-01 NA 0.00000000 0.61538462
## 30 2.038858e-01 NA 0.30769231 0.38461538
## [1] "0.841047 0.18159043583 0.19192423583"
## [1] "0.730685 0.22159249583 0.28055198583"
## [1] "0.809339 0.22159249583 0.28055198583"
## [1] "0.841047 0.18159043583 0.19192423583"
## [1] "0.0902123 0.20020729583 0.240982972350924"
## [1] 1.435830e-06 2.091258e-01 4.307470e-06 2.286661e-01 2.805520e-01
## [6] 2.316277e-01 2.672453e-01 2.441714e-01 2.640185e-01 2.153725e-01
## [11] 2.409830e-01 1.982356e-01 2.176475e-01
## [1] 0.003
## [1] "mse 0.000965112171597633"
t = plot_tree(paste0(PopStr_dir, "High_anc_coef_only_outgroups_positions.with_outgroups.m6.treemix.out"))
## V1 V2 V3 V4 V5 V6 V7 V8 V9 V10
## 1 1 Zpa63 NOT_ROOT NOT_MIG TIP 305 NA NA NA NA
## 2 2 <NA> NOT_ROOT NOT_MIG NOT_TIP 305 32 11 4 1
## 3 3 V6 NOT_ROOT NOT_MIG TIP 16 NA NA NA NA
## 4 4 Za17 NOT_ROOT NOT_MIG TIP 2 NA NA NA NA
## 5 15 V7 NOT_ROOT NOT_MIG TIP 16 NA NA NA NA
## 6 16 <NA> NOT_ROOT NOT_MIG NOT_TIP 172 3 1 15 1
## 7 31 V4 NOT_ROOT NOT_MIG TIP 136 NA NA NA NA
## 8 32 <NA> NOT_ROOT NOT_MIG NOT_TIP 2 172 3 104 8
## 9 51 V10 NOT_ROOT NOT_MIG TIP 76 NA NA NA NA
## 10 52 <NA> NOT_ROOT MIG NOT_TIP 104 304 NA NA NA
## 11 75 V8 NOT_ROOT NOT_MIG TIP 76 NA NA NA NA
## 12 76 <NA> NOT_ROOT NOT_MIG NOT_TIP 533 75 1 51 1
## 13 103 V9 NOT_ROOT NOT_MIG TIP 304 NA NA NA NA
## 14 104 <NA> NOT_ROOT NOT_MIG NOT_TIP 32 256 6 304 2
## 15 135 V1 NOT_ROOT NOT_MIG TIP 136 NA NA NA NA
## 16 136 <NA> NOT_ROOT NOT_MIG NOT_TIP 212 135 1 31 1
## 17 171 V11 NOT_ROOT NOT_MIG TIP 172 NA NA NA NA
## 18 172 <NA> NOT_ROOT NOT_MIG NOT_TIP 32 16 2 171 1
## 19 211 V3 NOT_ROOT NOT_MIG TIP 533 NA NA NA NA
## 20 212 <NA> NOT_ROOT NOT_MIG NOT_TIP 256 136 2 533 3
## 21 255 V2 NOT_ROOT NOT_MIG TIP 256 NA NA NA NA
## 22 256 <NA> NOT_ROOT NOT_MIG NOT_TIP 104 255 1 212 5
## 23 303 V5 NOT_ROOT NOT_MIG TIP 304 NA NA NA NA
## 24 304 <NA> NOT_ROOT NOT_MIG NOT_TIP 104 303 1 103 1
## 25 305 <NA> ROOT NOT_MIG NOT_TIP 305 1 1 2 12
## 26 321 <NA> NOT_ROOT MIG NOT_TIP 32 172 NA NA NA
## 27 365 <NA> NOT_ROOT MIG NOT_TIP 136 31 NA NA NA
## 28 404 <NA> NOT_ROOT MIG NOT_TIP 136 31 NA NA NA
## 29 481 <NA> NOT_ROOT MIG NOT_TIP 32 172 NA NA NA
## 30 533 <NA> NOT_ROOT NOT_MIG NOT_TIP 212 76 2 211 1
## 31 620 <NA> NOT_ROOT MIG NOT_TIP 305 2 NA NA NA
## V11
## 1 Zpa63:0
## 2 ((((V6:0.00582354,V7:0.0253638):0.011378,V11:0.0234483):0.00982902,((V2:0,((V1:0.0425007,V4:0.0589566):0.0222719,((V8:0.176954,V10:0.0147421):0.0435919,V3:0.071955):0.00424389):0.00103085):0.00318635,(V5:0.0072335,V9:0.0448782):0.0343367):0.0136708):0.181977,Za17:2.2394e-06):2.84783e-06
## 3 V6:0.00582354
## 4 Za17:2.2394e-06
## 5 V7:0.0253638
## 6 (V6:0.00582354,V7:0.0253638):0.011378
## 7 V4:0.0589566
## 8 (((V6:0.00582354,V7:0.0253638):0.011378,V11:0.0234483):0.00982902,((V2:0,((V1:0.0425007,V4:0.0589566):0.0222719,((V8:0.176954,V10:0.0147421):0.0435919,V3:0.071955):0.00424389):0.00103085):0.00318635,(V5:0.0072335,V9:0.0448782):0.0343367):0.0136708):0.181977
## 9 V10:0.0147421
## 10 <NA>
## 11 V8:0.176954
## 12 (V8:0.176954,V10:0.0147421):0.0435919
## 13 V9:0.0448782
## 14 ((V2:0,((V1:0.0425007,V4:0.0589566):0.0222719,((V8:0.176954,V10:0.0147421):0.0435919,V3:0.071955):0.00424389):0.00103085):0.00318635,(V5:0.0072335,V9:0.0448782):0.0343367):0.0136708
## 15 V1:0.0425007
## 16 (V1:0.0425007,V4:0.0589566):0.0222719
## 17 V11:0.0234483
## 18 ((V6:0.00582354,V7:0.0253638):0.011378,V11:0.0234483):0.00982902
## 19 V3:0.071955
## 20 ((V1:0.0425007,V4:0.0589566):0.0222719,((V8:0.176954,V10:0.0147421):0.0435919,V3:0.071955):0.00424389):0.00103085
## 21 V2:0
## 22 (V2:0,((V1:0.0425007,V4:0.0589566):0.0222719,((V8:0.176954,V10:0.0147421):0.0435919,V3:0.071955):0.00424389):0.00103085):0.00318635
## 23 V5:0.0072335
## 24 (V5:0.0072335,V9:0.0448782):0.0343367
## 25 (Zpa63:0,((((V6:0.00582354,V7:0.0253638):0.011378,V11:0.0234483):0.00982902,((V2:0,((V1:0.0425007,V4:0.0589566):0.0222719,((V8:0.176954,V10:0.0147421):0.0435919,V3:0.071955):0.00424389):0.00103085):0.00318635,(V5:0.0072335,V9:0.0448782):0.0343367):0.0136708):0.181977,Za17:2.2394e-06):2.84783e-06);
## 26 <NA>
## 27 <NA>
## 28 <NA>
## 29 <NA>
## 30 ((V8:0.176954,V10:0.0147421):0.0435919,V3:0.071955):0.00424389
## 31 <NA>
## x y ymin ymax
## 1 0.000000e+00 0.96153846 0.92307692 1.00000000
## 2 2.847830e-06 0.07692308 0.00000000 0.92307692
## 3 2.090104e-01 0.88461538 0.84615385 0.92307692
## 4 5.087230e-06 0.03846154 0.00000000 0.07692308
## 5 2.285507e-01 0.80769231 0.76923077 0.84615385
## 6 2.031869e-01 0.84615385 0.76923077 0.92307692
## 7 2.810964e-01 0.50000000 0.46153846 0.53846154
## 8 1.819798e-01 0.69230769 0.07692308 0.92307692
## 9 2.409417e-01 0.34615385 0.30769231 0.38461538
## 10 1.966620e-01 NA 0.07692308 0.23076923
## 11 2.776415e-01 0.42307692 0.38461538 0.46153846
## 12 2.266214e-01 0.38461538 0.30769231 0.46153846
## 13 2.447446e-01 0.11538462 0.07692308 0.15384615
## 14 1.956506e-01 0.23076923 0.07692308 0.69230769
## 15 2.646404e-01 0.57692308 0.53846154 0.61538462
## 16 2.221397e-01 0.53846154 0.46153846 0.61538462
## 17 2.152572e-01 0.73076923 0.69230769 0.76923077
## 18 1.918089e-01 0.76923077 0.69230769 0.92307692
## 19 2.427693e-01 0.26923077 0.23076923 0.30769231
## 20 1.998678e-01 0.46153846 0.23076923 0.61538462
## 21 1.988370e-01 0.65384615 0.61538462 0.69230769
## 22 1.988370e-01 0.61538462 0.23076923 0.69230769
## 23 2.181631e-01 0.19230769 0.15384615 0.23076923
## 24 2.109296e-01 0.15384615 0.07692308 0.23076923
## 25 0.000000e+00 0.92307692 0.00000000 1.00000000
## 26 1.896387e-01 NA 0.07692308 0.69230769
## 27 2.617896e-01 NA 0.46153846 0.53846154
## 28 2.810964e-01 NA 0.46153846 0.53846154
## 29 1.918089e-01 NA 0.07692308 0.69230769
## 30 2.041117e-01 0.30769231 0.23076923 0.46153846
## 31 0.000000e+00 NA 0.00000000 0.92307692
## [1] "0.0661926 0.19565064783 0.210929568478836"
## [1] "0.779213 0.18197984783 0.19180886783"
## [1] "0.672527 0.22213974783 0.28109639383"
## [1] "0.684004 0.22213974783 0.28109639383"
## [1] "0.779213 0.18197984783 0.19180886783"
## [1] "0 0 2.84783e-06"
## [1] 0.000000e+00 2.090104e-01 5.087230e-06 2.285507e-01 2.810964e-01
## [6] 2.409417e-01 2.776415e-01 2.447446e-01 2.646404e-01 2.152572e-01
## [11] 2.427693e-01 1.988370e-01 2.181631e-01
## [1] 0.003
## [1] "mse 0.000965112171597633"
t = plot_tree(paste0(PopStr_dir, "High_anc_coef_only_outgroups_positions.with_outgroups.m7.treemix.out"))
## V1 V2 V3 V4 V5 V6 V7 V8 V9 V10
## 1 1 Zpa63 NOT_ROOT NOT_MIG TIP 305 NA NA NA NA
## 2 2 <NA> NOT_ROOT NOT_MIG NOT_TIP 305 4 1 32 11
## 3 3 V6 NOT_ROOT NOT_MIG TIP 16 NA NA NA NA
## 4 4 Za17 NOT_ROOT NOT_MIG TIP 2 NA NA NA NA
## 5 15 V7 NOT_ROOT NOT_MIG TIP 16 NA NA NA NA
## 6 16 <NA> NOT_ROOT NOT_MIG NOT_TIP 172 3 1 15 1
## 7 31 V4 NOT_ROOT NOT_MIG TIP 136 NA NA NA NA
## 8 32 <NA> NOT_ROOT NOT_MIG NOT_TIP 2 172 3 104 8
## 9 51 V10 NOT_ROOT NOT_MIG TIP 76 NA NA NA NA
## 10 52 <NA> NOT_ROOT MIG NOT_TIP 104 304 NA NA NA
## 11 75 V8 NOT_ROOT NOT_MIG TIP 76 NA NA NA NA
## 12 76 <NA> NOT_ROOT NOT_MIG NOT_TIP 533 51 1 75 1
## 13 103 V9 NOT_ROOT NOT_MIG TIP 304 NA NA NA NA
## 14 104 <NA> NOT_ROOT NOT_MIG NOT_TIP 32 256 6 304 2
## 15 135 V1 NOT_ROOT NOT_MIG TIP 136 NA NA NA NA
## 16 136 <NA> NOT_ROOT NOT_MIG NOT_TIP 212 31 1 135 1
## 17 171 V11 NOT_ROOT NOT_MIG TIP 172 NA NA NA NA
## 18 172 <NA> NOT_ROOT NOT_MIG NOT_TIP 32 171 1 16 2
## 19 211 V3 NOT_ROOT NOT_MIG TIP 533 NA NA NA NA
## 20 212 <NA> NOT_ROOT NOT_MIG NOT_TIP 256 136 2 533 3
## 21 255 V2 NOT_ROOT NOT_MIG TIP 256 NA NA NA NA
## 22 256 <NA> NOT_ROOT NOT_MIG NOT_TIP 104 255 1 212 5
## 23 303 V5 NOT_ROOT NOT_MIG TIP 304 NA NA NA NA
## 24 304 <NA> NOT_ROOT NOT_MIG NOT_TIP 104 303 1 103 1
## 25 305 <NA> ROOT NOT_MIG NOT_TIP 305 1 1 2 12
## 26 321 <NA> NOT_ROOT MIG NOT_TIP 32 172 NA NA NA
## 27 365 <NA> NOT_ROOT MIG NOT_TIP 136 31 NA NA NA
## 28 404 <NA> NOT_ROOT MIG NOT_TIP 136 31 NA NA NA
## 29 481 <NA> NOT_ROOT MIG NOT_TIP 32 172 NA NA NA
## 30 533 <NA> NOT_ROOT NOT_MIG NOT_TIP 212 76 2 211 1
## 31 702 <NA> NOT_ROOT MIG NOT_TIP 304 103 NA NA NA
## 32 620 <NA> NOT_ROOT MIG NOT_TIP 2 32 NA NA NA
## V11
## 1 Zpa63:8.71101e-07
## 2 (Za17:0,((V11:0.0258657,(V6:0.0058236,V7:0.0253639):0.0106469):0.0103785,((V2:0,((V4:0.0589702,V1:0.0425645):0.0223678,((V10:0.0151966,V8:0.181694):0.0334395,V3:0.0738101):0.00378898):0.000927666):0.00380613,(V5:0.00680393,V9:0.0466579):0.0318388):0.0131475):0.181864):8.71101e-07
## 3 V6:0.0058236
## 4 Za17:0
## 5 V7:0.0253639
## 6 (V6:0.0058236,V7:0.0253639):0.0106469
## 7 V4:0.0589702
## 8 ((V11:0.0258657,(V6:0.0058236,V7:0.0253639):0.0106469):0.0103785,((V2:0,((V4:0.0589702,V1:0.0425645):0.0223678,((V10:0.0151966,V8:0.181694):0.0334395,V3:0.0738101):0.00378898):0.000927666):0.00380613,(V5:0.00680393,V9:0.0466579):0.0318388):0.0131475):0.181864
## 9 V10:0.0151966
## 10 <NA>
## 11 V8:0.181694
## 12 (V10:0.0151966,V8:0.181694):0.0334395
## 13 V9:0.0466579
## 14 ((V2:0,((V4:0.0589702,V1:0.0425645):0.0223678,((V10:0.0151966,V8:0.181694):0.0334395,V3:0.0738101):0.00378898):0.000927666):0.00380613,(V5:0.00680393,V9:0.0466579):0.0318388):0.0131475
## 15 V1:0.0425645
## 16 (V4:0.0589702,V1:0.0425645):0.0223678
## 17 V11:0.0258657
## 18 (V11:0.0258657,(V6:0.0058236,V7:0.0253639):0.0106469):0.0103785
## 19 V3:0.0738101
## 20 ((V4:0.0589702,V1:0.0425645):0.0223678,((V10:0.0151966,V8:0.181694):0.0334395,V3:0.0738101):0.00378898):0.000927666
## 21 V2:0
## 22 (V2:0,((V4:0.0589702,V1:0.0425645):0.0223678,((V10:0.0151966,V8:0.181694):0.0334395,V3:0.0738101):0.00378898):0.000927666):0.00380613
## 23 V5:0.00680393
## 24 (V5:0.00680393,V9:0.0466579):0.0318388
## 25 (Zpa63:8.71101e-07,(Za17:0,((V11:0.0258657,(V6:0.0058236,V7:0.0253639):0.0106469):0.0103785,((V2:0,((V4:0.0589702,V1:0.0425645):0.0223678,((V10:0.0151966,V8:0.181694):0.0334395,V3:0.0738101):0.00378898):0.000927666):0.00380613,(V5:0.00680393,V9:0.0466579):0.0318388):0.0131475):0.181864):8.71101e-07);
## 26 <NA>
## 27 <NA>
## 28 <NA>
## 29 <NA>
## 30 ((V10:0.0151966,V8:0.181694):0.0334395,V3:0.0738101):0.00378898
## 31 <NA>
## 32 <NA>
## x y ymin ymax
## 1 8.711010e-07 0.96153846 0.92307692 1.00000000
## 2 8.711010e-07 0.84615385 0.00000000 0.92307692
## 3 2.087135e-01 0.73076923 0.69230769 0.76923077
## 4 8.711010e-07 0.88461538 0.84615385 0.92307692
## 5 2.282538e-01 0.65384615 0.61538462 0.69230769
## 6 2.028899e-01 0.69230769 0.61538462 0.76923077
## 7 2.810838e-01 0.50000000 0.46153846 0.53846154
## 8 1.818645e-01 0.61538462 0.00000000 0.84615385
## 9 2.407640e-01 0.34615385 0.30769231 0.38461538
## 10 1.977550e-01 NA 0.00000000 0.15384615
## 11 2.773615e-01 0.26923077 0.23076923 0.30769231
## 12 2.264316e-01 0.30769231 0.23076923 0.38461538
## 13 2.446166e-01 0.03846154 0.00000000 0.07692308
## 14 1.950120e-01 0.15384615 0.00000000 0.61538462
## 15 2.646781e-01 0.42307692 0.38461538 0.46153846
## 16 2.221136e-01 0.46153846 0.38461538 0.53846154
## 17 2.163304e-01 0.80769231 0.76923077 0.84615385
## 18 1.922430e-01 0.76923077 0.61538462 0.84615385
## 19 2.424522e-01 0.19230769 0.15384615 0.23076923
## 20 1.997458e-01 0.38461538 0.15384615 0.53846154
## 21 1.988181e-01 0.57692308 0.53846154 0.61538462
## 22 1.988181e-01 0.53846154 0.15384615 0.61538462
## 23 2.173447e-01 0.11538462 0.07692308 0.15384615
## 24 2.105408e-01 0.07692308 0.00000000 0.15384615
## 25 0.000000e+00 0.92307692 0.00000000 1.00000000
## 26 1.894269e-01 NA 0.00000000 0.61538462
## 27 2.566255e-01 NA 0.38461538 0.46153846
## 28 2.810838e-01 NA 0.38461538 0.46153846
## 29 1.922430e-01 NA 0.00000000 0.61538462
## 30 2.035347e-01 0.23076923 0.15384615 0.38461538
## 31 2.446166e-01 NA 0.00000000 0.07692308
## 32 8.932167e-02 NA 0.00000000 0.84615385
## [1] "0.176644 0.195011971101 0.210540791704741"
## [1] "0.728667 0.181864471101 0.192242951101"
## [1] "0.585243 0.222113567101 0.281083837101"
## [1] "0.662929 0.222113567101 0.281083837101"
## [1] "0.728667 0.181864471101 0.192242951101"
## [1] "1 0.210540791704741 0.244616577776658"
## [1] "0.491142 8.71101e-07 0.181864471101"
## [1] 8.711010e-07 2.087135e-01 8.711010e-07 2.282538e-01 2.810838e-01
## [6] 2.407640e-01 2.773615e-01 2.446166e-01 2.646781e-01 2.163304e-01
## [11] 2.424522e-01 1.988181e-01 2.173447e-01
## [1] 0.003
## [1] "mse 0.000965112171597633"
mig_events = c(0:6)
temp = tibble(mig_events = mig_events,
var_explained = sapply(paste0(PopStr_dir, "High_anc_coef_only_outgroups_positions.with_outgroups.m",
mig_events, ".treemix.out"), get_f))
ggplot(temp, aes(mig_events, var_explained)) +
geom_line() +
geom_point() +
geom_hline(yintercept = 0.995, color = "#cbf3f0", linetype = "dashed", size = 1) +
theme_bw() +
labs(x = "Number of infered migration events", y = "Proportion of explained variance",
title = str_wrap(paste0("Variance explained by the model of Treemix with",
" different number of migration events"), 70))
The genetic diversity was estimated per window of 10kb over 10 random draw of samples.
#rsync -avP alice@130.125.25.244:/data2/alice/WW_project/3_Sumstats_demography/Sample_list_V*_*_diversity_pi.tsv \
# ~/Documents/Postdoc_Bruce/Projects/WW_project/3_Sumstats_demography/
echo -e "Subset_samples\tChromosome\tStart\tStop\tPi\tTajimaD\tTheta" > \
~/Documents/Postdoc_Bruce/Projects/WW_project/3_Sumstats_demography/Diversity_per_cluster.tsv
cat ~/Documents/Postdoc_Bruce/Projects/WW_project/3_Sumstats_demography/Sample_list_V*_*_diversity_pi.tsv >> \
~/Documents/Postdoc_Bruce/Projects/WW_project/3_Sumstats_demography/Diversity_per_cluster.tsv
Let’s visualize the difference of genetic diversity per cluster.
diversity_values = read_tsv(paste0(Sumstats_dir, "Diversity_per_cluster.tsv"), na = "nan") %>%
dplyr::select(-Theta, -TajimaD)
diversity_values %>%
dplyr::mutate(Subset_samples = str_remove(Subset_samples, "Sample_list_")) %>%
separate(Subset_samples, into = c("Cluster", "Rep")) %>%
group_by(Cluster) %>%
dplyr::summarize(Pi = median(Pi, na.rm = T)) %>%
ggplot() +
geom_point(aes(x = Cluster, y = Pi, color = Cluster), size = 4) +
geom_segment(aes(x = Cluster, xend = Cluster, y = 0, yend = Pi, color = Cluster), size = 1) +
theme_bw()+
theme(axis.title.x = element_blank(), axis.text.x = element_text(angle = 40, hjust = 1, vjust = 1)) +
Color_Cluster
temp = diversity_values %>%
mutate(Subset_samples = str_remove(Subset_samples, "Sample_list_")) %>%
separate(Subset_samples, into = c("Cluster", "Rep")) %>%
group_by(Cluster, Chromosome, Start, Stop) %>%
#group_by(Cluster, Rep) %>%
dplyr::summarize(Pi = mean(Pi, na.rm = T))
pi_data = temp %>%
filter(!is.na(Cluster)) %>%
group_by(Cluster) %>%
dplyr::mutate(cluster_avg = median(Pi, na.rm = T))
#One-way ANOVA with blocks
##Define linear model
model = lm(Pi ~ Cluster,
data=pi_data)
summary(model) ### Will show overall p-value and r-squared
##
## Call:
## lm(formula = Pi ~ Cluster, data = pi_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.014185 -0.005772 -0.001738 0.004208 0.066385
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.0057819 0.0001419 40.739 < 2e-16 ***
## ClusterV10 0.0050513 0.0002007 25.167 < 2e-16 ***
## ClusterV11 0.0080221 0.0002007 39.969 < 2e-16 ***
## ClusterV2 0.0075684 0.0002007 37.708 < 2e-16 ***
## ClusterV3 0.0031074 0.0002007 15.482 < 2e-16 ***
## ClusterV4 -0.0013933 0.0002007 -6.942 3.93e-12 ***
## ClusterV5 0.0061510 0.0002007 30.646 < 2e-16 ***
## ClusterV6 0.0084033 0.0002007 41.868 < 2e-16 ***
## ClusterV7 0.0060794 0.0002007 30.289 < 2e-16 ***
## ClusterV8 0.0031009 0.0002007 15.449 < 2e-16 ***
## ClusterV9 0.0023513 0.0002007 11.715 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.008584 on 40227 degrees of freedom
## Multiple R-squared: 0.1157, Adjusted R-squared: 0.1154
## F-statistic: 526.1 on 10 and 40227 DF, p-value: < 2.2e-16
##Conduct analysis of variance
Anova(model,type = "II")
## Anova Table (Type II tests)
##
## Response: Pi
## Sum Sq Df F value Pr(>F)
## Cluster 0.38765 10 526.12 < 2.2e-16 ***
## Residuals 2.96396 40227
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(model)
##
## Call:
## lm(formula = Pi ~ Cluster, data = pi_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.014185 -0.005772 -0.001738 0.004208 0.066385
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.0057819 0.0001419 40.739 < 2e-16 ***
## ClusterV10 0.0050513 0.0002007 25.167 < 2e-16 ***
## ClusterV11 0.0080221 0.0002007 39.969 < 2e-16 ***
## ClusterV2 0.0075684 0.0002007 37.708 < 2e-16 ***
## ClusterV3 0.0031074 0.0002007 15.482 < 2e-16 ***
## ClusterV4 -0.0013933 0.0002007 -6.942 3.93e-12 ***
## ClusterV5 0.0061510 0.0002007 30.646 < 2e-16 ***
## ClusterV6 0.0084033 0.0002007 41.868 < 2e-16 ***
## ClusterV7 0.0060794 0.0002007 30.289 < 2e-16 ***
## ClusterV8 0.0031009 0.0002007 15.449 < 2e-16 ***
## ClusterV9 0.0023513 0.0002007 11.715 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.008584 on 40227 degrees of freedom
## Multiple R-squared: 0.1157, Adjusted R-squared: 0.1154
## F-statistic: 526.1 on 10 and 40227 DF, p-value: < 2.2e-16
hist(residuals(model), col="darkgray")
#Post-hoc analysis: mean separation tests
marginal = lsmeans(model, ~ Cluster)
pairs(marginal, adjust="tukey")
## contrast estimate SE df t.ratio p.value
## V1 - V10 -5.05e-03 0.000201 40227 -25.167 <.0001
## V1 - V11 -8.02e-03 0.000201 40227 -39.969 <.0001
## V1 - V2 -7.57e-03 0.000201 40227 -37.708 <.0001
## V1 - V3 -3.11e-03 0.000201 40227 -15.482 <.0001
## V1 - V4 1.39e-03 0.000201 40227 6.942 <.0001
## V1 - V5 -6.15e-03 0.000201 40227 -30.646 <.0001
## V1 - V6 -8.40e-03 0.000201 40227 -41.868 <.0001
## V1 - V7 -6.08e-03 0.000201 40227 -30.289 <.0001
## V1 - V8 -3.10e-03 0.000201 40227 -15.449 <.0001
## V1 - V9 -2.35e-03 0.000201 40227 -11.715 <.0001
## V10 - V11 -2.97e-03 0.000201 40227 -14.802 <.0001
## V10 - V2 -2.52e-03 0.000201 40227 -12.541 <.0001
## V10 - V3 1.94e-03 0.000201 40227 9.685 <.0001
## V10 - V4 6.44e-03 0.000201 40227 32.109 <.0001
## V10 - V5 -1.10e-03 0.000201 40227 -5.479 <.0001
## V10 - V6 -3.35e-03 0.000201 40227 -16.701 <.0001
## V10 - V7 -1.03e-03 0.000201 40227 -5.122 <.0001
## V10 - V8 1.95e-03 0.000201 40227 9.718 <.0001
## V10 - V9 2.70e-03 0.000201 40227 13.452 <.0001
## V11 - V2 4.54e-04 0.000201 40227 2.261 0.4614
## V11 - V3 4.91e-03 0.000201 40227 24.486 <.0001
## V11 - V4 9.42e-03 0.000201 40227 46.911 <.0001
## V11 - V5 1.87e-03 0.000201 40227 9.322 <.0001
## V11 - V6 -3.81e-04 0.000201 40227 -1.899 0.7183
## V11 - V7 1.94e-03 0.000201 40227 9.679 <.0001
## V11 - V8 4.92e-03 0.000201 40227 24.519 <.0001
## V11 - V9 5.67e-03 0.000201 40227 28.254 <.0001
## V2 - V3 4.46e-03 0.000201 40227 22.226 <.0001
## V2 - V4 8.96e-03 0.000201 40227 44.650 <.0001
## V2 - V5 1.42e-03 0.000201 40227 7.062 <.0001
## V2 - V6 -8.35e-04 0.000201 40227 -4.159 0.0016
## V2 - V7 1.49e-03 0.000201 40227 7.419 <.0001
## V2 - V8 4.47e-03 0.000201 40227 22.259 <.0001
## V2 - V9 5.22e-03 0.000201 40227 25.993 <.0001
## V3 - V4 4.50e-03 0.000201 40227 22.424 <.0001
## V3 - V5 -3.04e-03 0.000201 40227 -15.164 <.0001
## V3 - V6 -5.30e-03 0.000201 40227 -26.385 <.0001
## V3 - V7 -2.97e-03 0.000201 40227 -14.807 <.0001
## V3 - V8 6.58e-06 0.000201 40227 0.033 1.0000
## V3 - V9 7.56e-04 0.000201 40227 3.767 0.0077
## V4 - V5 -7.54e-03 0.000201 40227 -37.588 <.0001
## V4 - V6 -9.80e-03 0.000201 40227 -48.809 <.0001
## V4 - V7 -7.47e-03 0.000201 40227 -37.231 <.0001
## V4 - V8 -4.49e-03 0.000201 40227 -22.391 <.0001
## V4 - V9 -3.74e-03 0.000201 40227 -18.657 <.0001
## V5 - V6 -2.25e-03 0.000201 40227 -11.221 <.0001
## V5 - V7 7.16e-05 0.000201 40227 0.357 1.0000
## V5 - V8 3.05e-03 0.000201 40227 15.197 <.0001
## V5 - V9 3.80e-03 0.000201 40227 18.931 <.0001
## V6 - V7 2.32e-03 0.000201 40227 11.578 <.0001
## V6 - V8 5.30e-03 0.000201 40227 26.418 <.0001
## V6 - V9 6.05e-03 0.000201 40227 30.153 <.0001
## V7 - V8 2.98e-03 0.000201 40227 14.840 <.0001
## V7 - V9 3.73e-03 0.000201 40227 18.574 <.0001
## V8 - V9 7.50e-04 0.000201 40227 3.734 0.0087
##
## P value adjustment: tukey method for comparing a family of 11 estimates
CLD = cld(marginal,
alpha = 0.05,
Letters = letters, ### Use lower-case letters for .group
adjust = "tukey") ### Tukey-adjusted p-values
CLD
## Cluster lsmean SE df lower.CL upper.CL .group
## V4 0.00439 0.000142 40227 0.00399 0.00479 a
## V1 0.00578 0.000142 40227 0.00538 0.00618 b
## V9 0.00813 0.000142 40227 0.00773 0.00853 c
## V8 0.00888 0.000142 40227 0.00848 0.00928 d
## V3 0.00889 0.000142 40227 0.00849 0.00929 d
## V10 0.01083 0.000142 40227 0.01043 0.01123 e
## V7 0.01186 0.000142 40227 0.01146 0.01226 f
## V5 0.01193 0.000142 40227 0.01153 0.01233 f
## V2 0.01335 0.000142 40227 0.01295 0.01375 g
## V11 0.01380 0.000142 40227 0.01340 0.01421 gh
## V6 0.01419 0.000142 40227 0.01378 0.01459 h
##
## Confidence level used: 0.95
## Conf-level adjustment: sidak method for 11 estimates
## P value adjustment: tukey method for comparing a family of 11 estimates
## significance level used: alpha = 0.05
## NOTE: Compact letter displays can be misleading
## because they show NON-findings rather than findings.
## Consider using 'pairs()', 'pwpp()', or 'pwpm()' instead.
CLD$.group=gsub(" ", "", CLD$.group)
### Plot
ggplot(pi_data, aes(x = reorder(Cluster, -Pi), y = Pi)) +
coord_flip() +
geom_boxplot(outlier.shape = NA, alpha = .4, color = "grey") +
geom_point(aes(color = Cluster, x = Cluster, y = cluster_avg), size = 3)+
theme_cowplot() +
theme(legend.position = "none",
axis.text.x = element_text(angle = 40, hjust = 1)) +
labs (x = "", y = "Diversity (pi)") +
ylim(c(-0.001, 0.05)) +
geom_text(data = CLD, aes(x = Cluster, label = .group, y = 0.049), color = "black") +
Color_Cluster
There was a clear effect of the number of samples on the LD estimated, so I switched to subsets.
# Running the LD stats for 10 subsets per cluster
for i in {1..11} ;
do
for repet in {1..2} ;
do
echo $i $repet ;
sbatch -p normal.168h LD_decay.sh ./Directories_new.sh \
/data2/alice/WW_project/2_Population_structure/0_Nuclear_genome/Sample_list_V${i}_${repet}.args \
V${i}_${repet} ./temp.windows ;
done ;
done
# Gathering all tables into one
echo -e "Pop\tRepeat\tStart\tStop\tMean\tMedian\tSstdev" \
> /data2/alice/WW_project/2_Population_structure/0_Nuclear_genome/LD_per_cluster.tab ;
for i in {1..11} ;
do
for repet in {1..10} ;
do
awk -v pop=$i -v repet=$repet 'BEGIN {OFS = "\t"} {if ( $1 ~ /[0-9]/ ) {print pop, repet, $1, $2, $3, $4, $5 } }' \
/data2/alice/WW_project/2_Population_structure/0_Nuclear_genome/LD_V${i}_${repet}.tab \
>> /data2/alice/WW_project/2_Population_structure/0_Nuclear_genome/LD_per_cluster.tab;
done
done
The different dots are the values estimated on different subsets of samples (for some, the subsets are the size of the cluster so the variance is null). The line here is the median of the subset estimates ()
LD_data = read_delim(paste0(nuc_PS_dir, "LD_per_cluster.tab"),
delim = "\t") %>%
mutate(center = (Start + Stop) / 2) %>%
mutate(Cluster = paste0("V", Pop))
temp = LD_data %>% group_by(Cluster, center) %>%
dplyr::summarize(Median = mean(Median))
ggplot() +
geom_point(data = LD_data,
aes(x = center, y = Median, color = Cluster, shape = Cluster),
alpha = .2) +
geom_line(data = temp, aes(x = center, y = Median, color = Cluster, shape = Cluster)) +
geom_hline(yintercept = 0.2, linetype = "dashed")+
Color_Cluster + Shape_Cluster +
theme_bw()
ggplot() +
geom_point(data = LD_data,
aes(x = center, y = Median, color = Cluster, shape = Cluster),
alpha = .2) +
geom_line(data = temp, aes(x = center, y = Median, color = Cluster, shape = Cluster)) +
geom_hline(yintercept = 0.2, linetype = "dashed") +
Color_Cluster2 + Shape_Cluster +
theme_bw() +
scale_y_log10()
temp %>%
filter(Median >= 0.2) %>%
group_by(Cluster) %>%
dplyr::summarize(Center = max(center)) %>%
ggplot() +
geom_point(aes(x = Cluster, y = Center, color = Cluster), size = 4) +
geom_segment(aes(x = Cluster, xend = Cluster, y = 0, yend = Center, color = Cluster), size = 1) +
Color_Cluster +
theme_bw() +
scale_y_log10() +
labs(y = "Distance between SNPs with a Rsquared <= 0.2",
title = "LD decay, summary statistic") +
coord_flip()
I estimate the Fst between clusters and represent the results as a matrix with colors indicating the Fst values. The order of the clusters is related to their place in the tree from treemix (without any migration event).
for i in {1..11} ; do for j in {1..11} ; do python Divergence_with_scikit-allel_Fst.py --vcf_file ../1_Variant_calling/4_Joint_calling/Ztritici_global_March2021.genotyped.ALL.filtered.clean.AB_filtered.variants.good_samples.biall_SNP.max-m-1.maf-0.05.thin-1000bp.recode.vcf --sample_list1 ../2_Population_structure/Sample_list_V${i}.args --sample_list2 ../2_Population_structure/Sample_list_V${j}.args --out_dir ../2_Population_structure/Divergence/ --no_header ; done ; done
echo -e "Subset1\tSubset2\tHudsons_Fst\tWeir_Cockerham_Fst" > \
../2_Population_structure/Divergence/Divergence.Fst_between_clusters.tsv
cat ../2_Population_structure/Divergence/Divergence.Fst.Sample_list_V*_vs_Sample_list_V*.tsv >> \
../2_Population_structure/Divergence/Divergence.Fst_between_clusters.tsv
ordering_table = tibble(Clusters = c("V1", "V2", "V3", "V4", "V5", "V6", "V7", "V8", "V9", "V10", "V11"),
Order_tree = c(7,9,8,6,10,2,1,5,11,4,3))
Fst_values = read_tsv(paste0(PopStr_dir, "Divergence/Divergence.Fst_between_clusters.tsv"))
temp = Fst_values %>% dplyr::select(-Hudsons_Fst) %>%
mutate(Weir_Cockerham_Fst = ifelse(Weir_Cockerham_Fst < 0, 0, Weir_Cockerham_Fst),
Subset1 = str_remove(Subset1, "Sample_list_"),
Subset2 = str_remove(Subset2, "Sample_list_")) %>%
left_join(., ordering_table, by = c("Subset1" = "Clusters")) %>%
left_join(., ordering_table, by = c("Subset2" = "Clusters")) %>%
arrange(Order_tree.x, Order_tree.y) %>%
dplyr::select(-Order_tree.x, -Order_tree.y) %>%
pivot_wider(names_from = Subset2, values_from = Weir_Cockerham_Fst)
WC_Fst_values <- as.matrix(temp[,-1])
rownames(WC_Fst_values) <- temp[,1] %>% pull()
corrplot(WC_Fst_values, method="color", is.corr=FALSE,
tl.col = "black", tl.srt=45, tl.cex = 1,
addCoef.col = "black", col=colorRampPalette(c("white","#2ec4b6"))(200))
I am curious about isolation by distance in the different continents.
#Estimate geographic distances between samples
temp = dplyr::select(Zt_meta, ID_file, Latitude, Longitude) %>%
filter(!is.na(Latitude))
geo_distances = as.data.frame(geodist(x = temp, sequential = FALSE, measure = "geodesic"))
colnames(geo_distances) <- temp$ID_file
geo_distances$ID_file1 = temp$ID_file
geo_distances = as.tibble(geo_distances) %>%
pivot_longer(-ID_file1, names_to = "ID_file2", values_to = "Geo_distance")
#Run IBS with plink on cluster and import
# ${SOFTPATH}plink --distance square ibs --vcf ${vcf_dir}${VCFBasename}.genotyped.ALL.filtered.clean.AB_filtered.variants.good_samples.max-m-80.vcf.gz --out ${vcf_dir}${VCFBasename}.genotyped.ALL.filtered.clean.AB_filtered.variants.good_samples.max-m-80 --const-fid --not-chr 14-21 mt
#rsync -avP alice@130.125.25.244:/data2/alice/WW_project/1_Variant_calling/4_Joint_calling/Ztritici_global_March2021.genotyped.ALL.filtered.clean.AB_filtered.variants.good_samples.max-m-80.mibs.id ~/Documents/Postdoc_Bruce/Projects/WW_project/2_Population_structure/0_Nuclear_genome/All_good_samples.mibs.id
#rsync -avP alice@130.125.25.244:/data2/alice/WW_project/1_Variant_calling/4_Joint_calling/Ztritici_global_March2021.genotyped.ALL.filtered.clean.AB_filtered.variants.good_samples.max-m-80.mibs ~/Documents/Postdoc_Bruce/Projects/WW_project/2_Population_structure/0_Nuclear_genome/All_good_samples.mibs
#../Software/vcftools_jydu/src/cpp/vcftools --gzvcf ${vcf_dir}${VCFBasename}.genotyped.ALL.filtered.clean.AB_filtered.variants.good_samples.max-m-80.vcf.gz --missing-indv --out temp
#rsync -avP alice@130.125.25.244:/home/alice/WW_PopGen/temp.imiss ~/Documents/Postdoc_Bruce/Projects/WW_project/2_Population_structure/0_Nuclear_genome/All_good_samples.miss
#Read genetic distances
ibs = read_tsv(paste0(nuc_PS_dir, "All_good_samples.mibs.id"), col_names = c("Whatever", "ID_file1")) %>%
dplyr::select(ID_file1) %>% pull()
related = read_tsv(paste0(nuc_PS_dir, "All_good_samples.mibs"), col_names = ibs) %>%
mutate(ID_file1 = ibs) %>%
pivot_longer(names_to = "ID_file2", values_to = "Relatedness", cols = -ID_file1)
distances = inner_join(related, geo_distances)
#To do: compare intra-continent and inter-continent
#To do: compare intra-cluster and inter-cluster
#
distances = distances %>%
inner_join(Zt_meta %>% dplyr::select(ID_file1 = ID_file, Continent1 = Continent)) %>%
inner_join(Zt_meta %>% dplyr::select(ID_file2 = ID_file, Continent2 = Continent)) %>%
mutate(Continent_comparison = ifelse(Continent1 == Continent2, "Intra", "Inter")) %>%
filter(Continent1 != "Asia") %>%
filter(ID_file1 != ID_file2)
#Dot plots
distances %>%
filter(Continent_comparison == "Intra") %>%
ggplot(aes(x = Geo_distance, y = Relatedness)) +
geom_point(aes(col = Continent1), shape = 1, alpha = 0.9) +
theme_bw() +
facet_wrap(vars(Continent1), scales = "free") +
Color_Continent +
stat_smooth(col = "grey20", method = "lm") +
stat_cor(aes(label = paste(..rr.label.., ..p.label.., sep = "~`,`~")),label.y = 1) +
stat_regline_equation(label.y = 1.015)
# Discretized distances
temp = distances %>%
filter(Continent_comparison == "Intra") %>%
filter(Geo_distance < 4000000) %>%
mutate(Geo_distance_disc = as.factor(500*round(Geo_distance/500000))) %>%
dplyr::count(Continent1, Geo_distance_disc) #%>%
#filter(n >= 100)
distances %>%
filter(Continent_comparison == "Intra") %>%
mutate(Geo_distance_disc = as.factor(500*round(Geo_distance/500000)))%>%
# inner_join(temp) %>%
filter(Geo_distance < 4000000) %>%
ggplot(aes(x = Geo_distance_disc, y = Relatedness, col = Continent1)) +
geom_boxplot(outlier.shape = 1) +
theme_bw() +
facet_wrap(vars(Continent1), scales = "free_y") +
Color_Continent +
geom_text(data = temp, aes(x = Geo_distance_disc, y = 1, label = n),
col = "black", angle = 90, size = 4) +
theme(axis.text.x = element_text(angle = 45, hjust =1, vjust = 1))
#Both
temp = distances %>%
filter(Continent_comparison == "Intra") %>%
#filter(Geo_distance < 4000000) %>%
mutate(Geo_distance_disc = as.factor(500000*round(Geo_distance/500000)))
temp = distances %>%
filter(Continent_comparison == "Intra") %>%
#filter(Geo_distance < 4000000) %>%
mutate(Geo_distance_disc = cut_width(Geo_distance, width = 500000))
ggplot() +
geom_boxplot(data = temp, aes(x = Geo_distance_disc, y = Relatedness), outlier.shape = NA) +
theme_bw() +
facet_wrap(vars(Continent1), scales = "free")
ggplot(data = filter(distances, Continent_comparison == "Intra"),
aes(x = Geo_distance, y = Relatedness, col = Continent1)) +
geom_point(shape = 19, alpha = 0.05) +
theme_bw() +
facet_wrap(vars(Continent1)) +
Color_Continent +
stat_smooth(col = "grey20", method = "lm") +
stat_cor(aes(label = paste(..rr.label.., ..p.label.., sep = "~`,`~")),label.y = 1) +
stat_regline_equation(label.y = 1.015)
As most continents contain several genetic clusters, it is possible that the effect would be at least partially driven by the higher proportion of isolates from different genetic clusters in the higher geographically distances. So here, I look at isolation-by-distance but this time per cluster instead of per continent.
temp = list()
for (i in c(1:chosen_K)){
temp[[i]] = read_tsv(paste0(PopStr_dir, "Sample_list_V", i, ".args"), col_names = "ID_file") %>%
mutate( Cluster = paste0("V", i)) }
list_pure_cluster = bind_rows(temp)
distances = distances %>%
inner_join(list_pure_cluster %>% dplyr::select(ID_file1 = ID_file, Cluster1 = Cluster)) %>%
inner_join(list_pure_cluster %>% dplyr::select(ID_file2 = ID_file, Cluster2 = Cluster)) %>%
mutate(Cluster_comparison = ifelse(Cluster1 == Cluster2, "Intra", "Inter"))
distances %>%
filter(Cluster_comparison == "Intra") %>%
# inner_join(temp) %>%
# filter(Geo_distance < 4000000) %>%
ggplot(aes(x = Geo_distance, y = Relatedness, col = Cluster1)) +
geom_point() +
theme_bw() +
facet_wrap(vars(Cluster1), scales = "free") +
theme(axis.text.x = element_text(angle = 45, hjust =1, vjust = 1))+
stat_smooth(col = "grey20", method = "lm") +
stat_cor(aes(label = paste(..rr.label.., ..p.label.., sep = "~`,`~")),label.y = 1) +
stat_regline_equation(label.y = 1.015)+
Color_Cluster
# Discretized distances
temp = distances %>%
filter(Cluster_comparison == "Intra") %>%
filter(Geo_distance < 4000000) %>%
mutate(Geo_distance_disc = as.factor(500*round(Geo_distance/500000))) %>%
dplyr::count(Cluster1, Geo_distance_disc) #%>%
#filter(n >= 100)
distances %>%
filter(Cluster_comparison == "Intra") %>%
mutate(Geo_distance_disc = as.factor(500*round(Geo_distance/500000)))%>%
#filter(Geo_distance < 4000000) %>%
ggplot(aes(x = Geo_distance_disc, y = Relatedness, col = Cluster1)) +
geom_boxplot(outlier.shape = 1) +
theme_bw() +
facet_wrap(vars(Cluster1), scales = "free_y") +
Color_Cluster +
geom_text(data = temp, aes(x = Geo_distance_disc, y = 1, label = n),
col = "black", angle = 90, size = 4) +
theme(axis.text.x = element_text(angle = 45, hjust =1, vjust = 1))